In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import re

warnings.filterwarnings('ignore')
%matplotlib inline
In [2]:
pd.set_option("display.max_columns",250)
pd.set_option("display.max_rows",250)
pd.set_option("display.width",2000)
In [3]:
previous_application=pd.read_csv('E:/VCET/Trainity Intern/Project 6 Bank Loan Case Study/previous_application.csv')
    application_data=pd.read_csv('E:/VCET/Trainity Intern/Project 6 Bank Loan Case Study/application_data.csv')
In [4]:
previous_application.shape
Out[4]:
(1670214, 37)
In [5]:
application_data.shape
Out[5]:
(307511, 122)
In [6]:
previous_application.info(verbose=True)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1670214 entries, 0 to 1670213
Data columns (total 37 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   SK_ID_PREV                   1670214 non-null  int64  
 1   SK_ID_CURR                   1670214 non-null  int64  
 2   NAME_CONTRACT_TYPE           1670214 non-null  object 
 3   AMT_ANNUITY                  1297979 non-null  float64
 4   AMT_APPLICATION              1670214 non-null  float64
 5   AMT_CREDIT                   1670213 non-null  float64
 6   AMT_DOWN_PAYMENT             774370 non-null   float64
 7   AMT_GOODS_PRICE              1284699 non-null  float64
 8   WEEKDAY_APPR_PROCESS_START   1670214 non-null  object 
 9   HOUR_APPR_PROCESS_START      1670214 non-null  int64  
 10  FLAG_LAST_APPL_PER_CONTRACT  1670214 non-null  object 
 11  NFLAG_LAST_APPL_IN_DAY       1670214 non-null  int64  
 12  RATE_DOWN_PAYMENT            774370 non-null   float64
 13  RATE_INTEREST_PRIMARY        5951 non-null     float64
 14  RATE_INTEREST_PRIVILEGED     5951 non-null     float64
 15  NAME_CASH_LOAN_PURPOSE       1670214 non-null  object 
 16  NAME_CONTRACT_STATUS         1670214 non-null  object 
 17  DAYS_DECISION                1670214 non-null  int64  
 18  NAME_PAYMENT_TYPE            1670214 non-null  object 
 19  CODE_REJECT_REASON           1670214 non-null  object 
 20  NAME_TYPE_SUITE              849809 non-null   object 
 21  NAME_CLIENT_TYPE             1670214 non-null  object 
 22  NAME_GOODS_CATEGORY          1670214 non-null  object 
 23  NAME_PORTFOLIO               1670214 non-null  object 
 24  NAME_PRODUCT_TYPE            1670214 non-null  object 
 25  CHANNEL_TYPE                 1670214 non-null  object 
 26  SELLERPLACE_AREA             1670214 non-null  int64  
 27  NAME_SELLER_INDUSTRY         1670214 non-null  object 
 28  CNT_PAYMENT                  1297984 non-null  float64
 29  NAME_YIELD_GROUP             1670214 non-null  object 
 30  PRODUCT_COMBINATION          1669868 non-null  object 
 31  DAYS_FIRST_DRAWING           997149 non-null   float64
 32  DAYS_FIRST_DUE               997149 non-null   float64
 33  DAYS_LAST_DUE_1ST_VERSION    997149 non-null   float64
 34  DAYS_LAST_DUE                997149 non-null   float64
 35  DAYS_TERMINATION             997149 non-null   float64
 36  NFLAG_INSURED_ON_APPROVAL    997149 non-null   float64
dtypes: float64(15), int64(6), object(16)
memory usage: 471.5+ MB
In [7]:
application_data.info(verbose=True)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Data columns (total 122 columns):
 #    Column                        Dtype  
---   ------                        -----  
 0    SK_ID_CURR                    int64  
 1    TARGET                        int64  
 2    NAME_CONTRACT_TYPE            object 
 3    CODE_GENDER                   object 
 4    FLAG_OWN_CAR                  object 
 5    FLAG_OWN_REALTY               object 
 6    CNT_CHILDREN                  int64  
 7    AMT_INCOME_TOTAL              float64
 8    AMT_CREDIT                    float64
 9    AMT_ANNUITY                   float64
 10   AMT_GOODS_PRICE               float64
 11   NAME_TYPE_SUITE               object 
 12   NAME_INCOME_TYPE              object 
 13   NAME_EDUCATION_TYPE           object 
 14   NAME_FAMILY_STATUS            object 
 15   NAME_HOUSING_TYPE             object 
 16   REGION_POPULATION_RELATIVE    float64
 17   DAYS_BIRTH                    int64  
 18   DAYS_EMPLOYED                 int64  
 19   DAYS_REGISTRATION             float64
 20   DAYS_ID_PUBLISH               int64  
 21   OWN_CAR_AGE                   float64
 22   FLAG_MOBIL                    int64  
 23   FLAG_EMP_PHONE                int64  
 24   FLAG_WORK_PHONE               int64  
 25   FLAG_CONT_MOBILE              int64  
 26   FLAG_PHONE                    int64  
 27   FLAG_EMAIL                    int64  
 28   OCCUPATION_TYPE               object 
 29   CNT_FAM_MEMBERS               float64
 30   REGION_RATING_CLIENT          int64  
 31   REGION_RATING_CLIENT_W_CITY   int64  
 32   WEEKDAY_APPR_PROCESS_START    object 
 33   HOUR_APPR_PROCESS_START       int64  
 34   REG_REGION_NOT_LIVE_REGION    int64  
 35   REG_REGION_NOT_WORK_REGION    int64  
 36   LIVE_REGION_NOT_WORK_REGION   int64  
 37   REG_CITY_NOT_LIVE_CITY        int64  
 38   REG_CITY_NOT_WORK_CITY        int64  
 39   LIVE_CITY_NOT_WORK_CITY       int64  
 40   ORGANIZATION_TYPE             object 
 41   EXT_SOURCE_1                  float64
 42   EXT_SOURCE_2                  float64
 43   EXT_SOURCE_3                  float64
 44   APARTMENTS_AVG                float64
 45   BASEMENTAREA_AVG              float64
 46   YEARS_BEGINEXPLUATATION_AVG   float64
 47   YEARS_BUILD_AVG               float64
 48   COMMONAREA_AVG                float64
 49   ELEVATORS_AVG                 float64
 50   ENTRANCES_AVG                 float64
 51   FLOORSMAX_AVG                 float64
 52   FLOORSMIN_AVG                 float64
 53   LANDAREA_AVG                  float64
 54   LIVINGAPARTMENTS_AVG          float64
 55   LIVINGAREA_AVG                float64
 56   NONLIVINGAPARTMENTS_AVG       float64
 57   NONLIVINGAREA_AVG             float64
 58   APARTMENTS_MODE               float64
 59   BASEMENTAREA_MODE             float64
 60   YEARS_BEGINEXPLUATATION_MODE  float64
 61   YEARS_BUILD_MODE              float64
 62   COMMONAREA_MODE               float64
 63   ELEVATORS_MODE                float64
 64   ENTRANCES_MODE                float64
 65   FLOORSMAX_MODE                float64
 66   FLOORSMIN_MODE                float64
 67   LANDAREA_MODE                 float64
 68   LIVINGAPARTMENTS_MODE         float64
 69   LIVINGAREA_MODE               float64
 70   NONLIVINGAPARTMENTS_MODE      float64
 71   NONLIVINGAREA_MODE            float64
 72   APARTMENTS_MEDI               float64
 73   BASEMENTAREA_MEDI             float64
 74   YEARS_BEGINEXPLUATATION_MEDI  float64
 75   YEARS_BUILD_MEDI              float64
 76   COMMONAREA_MEDI               float64
 77   ELEVATORS_MEDI                float64
 78   ENTRANCES_MEDI                float64
 79   FLOORSMAX_MEDI                float64
 80   FLOORSMIN_MEDI                float64
 81   LANDAREA_MEDI                 float64
 82   LIVINGAPARTMENTS_MEDI         float64
 83   LIVINGAREA_MEDI               float64
 84   NONLIVINGAPARTMENTS_MEDI      float64
 85   NONLIVINGAREA_MEDI            float64
 86   FONDKAPREMONT_MODE            object 
 87   HOUSETYPE_MODE                object 
 88   TOTALAREA_MODE                float64
 89   WALLSMATERIAL_MODE            object 
 90   EMERGENCYSTATE_MODE           object 
 91   OBS_30_CNT_SOCIAL_CIRCLE      float64
 92   DEF_30_CNT_SOCIAL_CIRCLE      float64
 93   OBS_60_CNT_SOCIAL_CIRCLE      float64
 94   DEF_60_CNT_SOCIAL_CIRCLE      float64
 95   DAYS_LAST_PHONE_CHANGE        float64
 96   FLAG_DOCUMENT_2               int64  
 97   FLAG_DOCUMENT_3               int64  
 98   FLAG_DOCUMENT_4               int64  
 99   FLAG_DOCUMENT_5               int64  
 100  FLAG_DOCUMENT_6               int64  
 101  FLAG_DOCUMENT_7               int64  
 102  FLAG_DOCUMENT_8               int64  
 103  FLAG_DOCUMENT_9               int64  
 104  FLAG_DOCUMENT_10              int64  
 105  FLAG_DOCUMENT_11              int64  
 106  FLAG_DOCUMENT_12              int64  
 107  FLAG_DOCUMENT_13              int64  
 108  FLAG_DOCUMENT_14              int64  
 109  FLAG_DOCUMENT_15              int64  
 110  FLAG_DOCUMENT_16              int64  
 111  FLAG_DOCUMENT_17              int64  
 112  FLAG_DOCUMENT_18              int64  
 113  FLAG_DOCUMENT_19              int64  
 114  FLAG_DOCUMENT_20              int64  
 115  FLAG_DOCUMENT_21              int64  
 116  AMT_REQ_CREDIT_BUREAU_HOUR    float64
 117  AMT_REQ_CREDIT_BUREAU_DAY     float64
 118  AMT_REQ_CREDIT_BUREAU_WEEK    float64
 119  AMT_REQ_CREDIT_BUREAU_MON     float64
 120  AMT_REQ_CREDIT_BUREAU_QRT     float64
 121  AMT_REQ_CREDIT_BUREAU_YEAR    float64
dtypes: float64(65), int64(41), object(16)
memory usage: 286.2+ MB
In [8]:
previous_application.describe()
Out[8]:
SK_ID_PREV SK_ID_CURR AMT_ANNUITY AMT_APPLICATION AMT_CREDIT AMT_DOWN_PAYMENT AMT_GOODS_PRICE HOUR_APPR_PROCESS_START NFLAG_LAST_APPL_IN_DAY RATE_DOWN_PAYMENT RATE_INTEREST_PRIMARY RATE_INTEREST_PRIVILEGED DAYS_DECISION SELLERPLACE_AREA CNT_PAYMENT DAYS_FIRST_DRAWING DAYS_FIRST_DUE DAYS_LAST_DUE_1ST_VERSION DAYS_LAST_DUE DAYS_TERMINATION NFLAG_INSURED_ON_APPROVAL
count 1.670214e+06 1.670214e+06 1.297979e+06 1.670214e+06 1.670213e+06 7.743700e+05 1.284699e+06 1.670214e+06 1.670214e+06 774370.000000 5951.000000 5951.000000 1.670214e+06 1.670214e+06 1.297984e+06 997149.000000 997149.000000 997149.000000 997149.000000 997149.000000 997149.000000
mean 1.923089e+06 2.783572e+05 1.595512e+04 1.752339e+05 1.961140e+05 6.697402e+03 2.278473e+05 1.248418e+01 9.964675e-01 0.079637 0.188357 0.773503 -8.806797e+02 3.139511e+02 1.605408e+01 342209.855039 13826.269337 33767.774054 76582.403064 81992.343838 0.332570
std 5.325980e+05 1.028148e+05 1.478214e+04 2.927798e+05 3.185746e+05 2.092150e+04 3.153966e+05 3.334028e+00 5.932963e-02 0.107823 0.087671 0.100879 7.790997e+02 7.127443e+03 1.456729e+01 88916.115834 72444.869708 106857.034789 149647.415123 153303.516729 0.471134
min 1.000001e+06 1.000010e+05 0.000000e+00 0.000000e+00 0.000000e+00 -9.000000e-01 0.000000e+00 0.000000e+00 0.000000e+00 -0.000015 0.034781 0.373150 -2.922000e+03 -1.000000e+00 0.000000e+00 -2922.000000 -2892.000000 -2801.000000 -2889.000000 -2874.000000 0.000000
25% 1.461857e+06 1.893290e+05 6.321780e+03 1.872000e+04 2.416050e+04 0.000000e+00 5.084100e+04 1.000000e+01 1.000000e+00 0.000000 0.160716 0.715645 -1.300000e+03 -1.000000e+00 6.000000e+00 365243.000000 -1628.000000 -1242.000000 -1314.000000 -1270.000000 0.000000
50% 1.923110e+06 2.787145e+05 1.125000e+04 7.104600e+04 8.054100e+04 1.638000e+03 1.123200e+05 1.200000e+01 1.000000e+00 0.051605 0.189122 0.835095 -5.810000e+02 3.000000e+00 1.200000e+01 365243.000000 -831.000000 -361.000000 -537.000000 -499.000000 0.000000
75% 2.384280e+06 3.675140e+05 2.065842e+04 1.803600e+05 2.164185e+05 7.740000e+03 2.340000e+05 1.500000e+01 1.000000e+00 0.108909 0.193330 0.852537 -2.800000e+02 8.200000e+01 2.400000e+01 365243.000000 -411.000000 129.000000 -74.000000 -44.000000 1.000000
max 2.845382e+06 4.562550e+05 4.180581e+05 6.905160e+06 6.905160e+06 3.060045e+06 6.905160e+06 2.300000e+01 1.000000e+00 1.000000 1.000000 1.000000 -1.000000e+00 4.000000e+06 8.400000e+01 365243.000000 365243.000000 365243.000000 365243.000000 365243.000000 1.000000
In [9]:
application_data.describe()
Out[9]:
SK_ID_CURR TARGET CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY AMT_GOODS_PRICE REGION_POPULATION_RELATIVE DAYS_BIRTH DAYS_EMPLOYED DAYS_REGISTRATION DAYS_ID_PUBLISH OWN_CAR_AGE FLAG_MOBIL FLAG_EMP_PHONE FLAG_WORK_PHONE FLAG_CONT_MOBILE FLAG_PHONE FLAG_EMAIL CNT_FAM_MEMBERS REGION_RATING_CLIENT REGION_RATING_CLIENT_W_CITY HOUR_APPR_PROCESS_START REG_REGION_NOT_LIVE_REGION REG_REGION_NOT_WORK_REGION LIVE_REGION_NOT_WORK_REGION REG_CITY_NOT_LIVE_CITY REG_CITY_NOT_WORK_CITY LIVE_CITY_NOT_WORK_CITY EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3 APARTMENTS_AVG BASEMENTAREA_AVG YEARS_BEGINEXPLUATATION_AVG YEARS_BUILD_AVG COMMONAREA_AVG ELEVATORS_AVG ENTRANCES_AVG FLOORSMAX_AVG FLOORSMIN_AVG LANDAREA_AVG LIVINGAPARTMENTS_AVG LIVINGAREA_AVG NONLIVINGAPARTMENTS_AVG NONLIVINGAREA_AVG APARTMENTS_MODE BASEMENTAREA_MODE YEARS_BEGINEXPLUATATION_MODE YEARS_BUILD_MODE COMMONAREA_MODE ELEVATORS_MODE ENTRANCES_MODE FLOORSMAX_MODE FLOORSMIN_MODE LANDAREA_MODE LIVINGAPARTMENTS_MODE LIVINGAREA_MODE NONLIVINGAPARTMENTS_MODE NONLIVINGAREA_MODE APARTMENTS_MEDI BASEMENTAREA_MEDI YEARS_BEGINEXPLUATATION_MEDI YEARS_BUILD_MEDI COMMONAREA_MEDI ELEVATORS_MEDI ENTRANCES_MEDI FLOORSMAX_MEDI FLOORSMIN_MEDI LANDAREA_MEDI LIVINGAPARTMENTS_MEDI LIVINGAREA_MEDI NONLIVINGAPARTMENTS_MEDI NONLIVINGAREA_MEDI TOTALAREA_MODE OBS_30_CNT_SOCIAL_CIRCLE DEF_30_CNT_SOCIAL_CIRCLE OBS_60_CNT_SOCIAL_CIRCLE DEF_60_CNT_SOCIAL_CIRCLE DAYS_LAST_PHONE_CHANGE FLAG_DOCUMENT_2 FLAG_DOCUMENT_3 FLAG_DOCUMENT_4 FLAG_DOCUMENT_5 FLAG_DOCUMENT_6 FLAG_DOCUMENT_7 FLAG_DOCUMENT_8 FLAG_DOCUMENT_9 FLAG_DOCUMENT_10 FLAG_DOCUMENT_11 FLAG_DOCUMENT_12 FLAG_DOCUMENT_13 FLAG_DOCUMENT_14 FLAG_DOCUMENT_15 FLAG_DOCUMENT_16 FLAG_DOCUMENT_17 FLAG_DOCUMENT_18 FLAG_DOCUMENT_19 FLAG_DOCUMENT_20 FLAG_DOCUMENT_21 AMT_REQ_CREDIT_BUREAU_HOUR AMT_REQ_CREDIT_BUREAU_DAY AMT_REQ_CREDIT_BUREAU_WEEK AMT_REQ_CREDIT_BUREAU_MON AMT_REQ_CREDIT_BUREAU_QRT AMT_REQ_CREDIT_BUREAU_YEAR
count 307511.000000 307511.000000 307511.000000 3.075110e+05 3.075110e+05 307499.000000 3.072330e+05 307511.000000 307511.000000 307511.000000 307511.000000 307511.000000 104582.000000 307511.000000 307511.000000 307511.000000 307511.000000 307511.000000 307511.000000 307509.000000 307511.000000 307511.000000 307511.000000 307511.000000 307511.000000 307511.000000 307511.000000 307511.000000 307511.000000 134133.000000 3.068510e+05 246546.000000 151450.00000 127568.000000 157504.000000 103023.000000 92646.000000 143620.000000 152683.000000 154491.000000 98869.000000 124921.000000 97312.000000 153161.000000 93997.000000 137829.000000 151450.000000 127568.000000 157504.000000 103023.000000 92646.000000 143620.000000 152683.000000 154491.000000 98869.000000 124921.000000 97312.000000 153161.000000 93997.000000 137829.000000 151450.000000 127568.000000 157504.000000 103023.000000 92646.000000 143620.000000 152683.000000 154491.000000 98869.000000 124921.000000 97312.000000 153161.000000 93997.000000 137829.000000 159080.000000 306490.000000 306490.000000 306490.000000 306490.000000 307510.000000 307511.000000 307511.000000 307511.000000 307511.000000 307511.000000 307511.000000 307511.000000 307511.000000 307511.000000 307511.000000 307511.000000 307511.000000 307511.000000 307511.00000 307511.000000 307511.000000 307511.000000 307511.000000 307511.000000 307511.000000 265992.000000 265992.000000 265992.000000 265992.000000 265992.000000 265992.000000
mean 278180.518577 0.080729 0.417052 1.687979e+05 5.990260e+05 27108.573909 5.383962e+05 0.020868 -16036.995067 63815.045904 -4986.120328 -2994.202373 12.061091 0.999997 0.819889 0.199368 0.998133 0.281066 0.056720 2.152665 2.052463 2.031521 12.063419 0.015144 0.050769 0.040659 0.078173 0.230454 0.179555 0.502130 5.143927e-01 0.510853 0.11744 0.088442 0.977735 0.752471 0.044621 0.078942 0.149725 0.226282 0.231894 0.066333 0.100775 0.107399 0.008809 0.028358 0.114231 0.087543 0.977065 0.759637 0.042553 0.074490 0.145193 0.222315 0.228058 0.064958 0.105645 0.105975 0.008076 0.027022 0.117850 0.087955 0.977752 0.755746 0.044595 0.078078 0.149213 0.225897 0.231625 0.067169 0.101954 0.108607 0.008651 0.028236 0.102547 1.422245 0.143421 1.405292 0.100049 -962.858788 0.000042 0.710023 0.000081 0.015115 0.088055 0.000192 0.081376 0.003896 0.000023 0.003912 0.000007 0.003525 0.002936 0.00121 0.009928 0.000267 0.008130 0.000595 0.000507 0.000335 0.006402 0.007000 0.034362 0.267395 0.265474 1.899974
std 102790.175348 0.272419 0.722121 2.371231e+05 4.024908e+05 14493.737315 3.694465e+05 0.013831 4363.988632 141275.766519 3522.886321 1509.450419 11.944812 0.001803 0.384280 0.399526 0.043164 0.449521 0.231307 0.910682 0.509034 0.502737 3.265832 0.122126 0.219526 0.197499 0.268444 0.421124 0.383817 0.211062 1.910602e-01 0.194844 0.10824 0.082438 0.059223 0.113280 0.076036 0.134576 0.100049 0.144641 0.161380 0.081184 0.092576 0.110565 0.047732 0.069523 0.107936 0.084307 0.064575 0.110111 0.074445 0.132256 0.100977 0.143709 0.161160 0.081750 0.097880 0.111845 0.046276 0.070254 0.109076 0.082179 0.059897 0.112066 0.076144 0.134467 0.100368 0.145067 0.161934 0.082167 0.093642 0.112260 0.047415 0.070166 0.107462 2.400989 0.446698 2.379803 0.362291 826.808487 0.006502 0.453752 0.009016 0.122010 0.283376 0.013850 0.273412 0.062295 0.004771 0.062424 0.002550 0.059268 0.054110 0.03476 0.099144 0.016327 0.089798 0.024387 0.022518 0.018299 0.083849 0.110757 0.204685 0.916002 0.794056 1.869295
min 100002.000000 0.000000 0.000000 2.565000e+04 4.500000e+04 1615.500000 4.050000e+04 0.000290 -25229.000000 -17912.000000 -24672.000000 -7197.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 1.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.014568 8.173617e-08 0.000527 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -4292.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 189145.500000 0.000000 0.000000 1.125000e+05 2.700000e+05 16524.000000 2.385000e+05 0.010006 -19682.000000 -2760.000000 -7479.500000 -4299.000000 5.000000 1.000000 1.000000 0.000000 1.000000 0.000000 0.000000 2.000000 2.000000 2.000000 10.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.334007 3.924574e-01 0.370650 0.05770 0.044200 0.976700 0.687200 0.007800 0.000000 0.069000 0.166700 0.083300 0.018700 0.050400 0.045300 0.000000 0.000000 0.052500 0.040700 0.976700 0.699400 0.007200 0.000000 0.069000 0.166700 0.083300 0.016600 0.054200 0.042700 0.000000 0.000000 0.058300 0.043700 0.976700 0.691400 0.007900 0.000000 0.069000 0.166700 0.083300 0.018700 0.051300 0.045700 0.000000 0.000000 0.041200 0.000000 0.000000 0.000000 0.000000 -1570.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 278202.000000 0.000000 0.000000 1.471500e+05 5.135310e+05 24903.000000 4.500000e+05 0.018850 -15750.000000 -1213.000000 -4504.000000 -3254.000000 9.000000 1.000000 1.000000 0.000000 1.000000 0.000000 0.000000 2.000000 2.000000 2.000000 12.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.505998 5.659614e-01 0.535276 0.08760 0.076300 0.981600 0.755200 0.021100 0.000000 0.137900 0.166700 0.208300 0.048100 0.075600 0.074500 0.000000 0.003600 0.084000 0.074600 0.981600 0.764800 0.019000 0.000000 0.137900 0.166700 0.208300 0.045800 0.077100 0.073100 0.000000 0.001100 0.086400 0.075800 0.981600 0.758500 0.020800 0.000000 0.137900 0.166700 0.208300 0.048700 0.076100 0.074900 0.000000 0.003100 0.068800 0.000000 0.000000 0.000000 0.000000 -757.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000
75% 367142.500000 0.000000 1.000000 2.025000e+05 8.086500e+05 34596.000000 6.795000e+05 0.028663 -12413.000000 -289.000000 -2010.000000 -1720.000000 15.000000 1.000000 1.000000 0.000000 1.000000 1.000000 0.000000 3.000000 2.000000 2.000000 14.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.675053 6.636171e-01 0.669057 0.14850 0.112200 0.986600 0.823200 0.051500 0.120000 0.206900 0.333300 0.375000 0.085600 0.121000 0.129900 0.003900 0.027700 0.143900 0.112400 0.986600 0.823600 0.049000 0.120800 0.206900 0.333300 0.375000 0.084100 0.131300 0.125200 0.003900 0.023100 0.148900 0.111600 0.986600 0.825600 0.051300 0.120000 0.206900 0.333300 0.375000 0.086800 0.123100 0.130300 0.003900 0.026600 0.127600 2.000000 0.000000 2.000000 0.000000 -274.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.000000
max 456255.000000 1.000000 19.000000 1.170000e+08 4.050000e+06 258025.500000 4.050000e+06 0.072508 -7489.000000 365243.000000 0.000000 0.000000 91.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 20.000000 3.000000 3.000000 23.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 0.962693 8.549997e-01 0.896010 1.00000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 348.000000 34.000000 344.000000 24.000000 0.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.00000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 4.000000 9.000000 8.000000 27.000000 261.000000 25.000000
In [10]:
previous_application.agg(['count','size','nunique'])
Out[10]:
SK_ID_PREV SK_ID_CURR NAME_CONTRACT_TYPE AMT_ANNUITY AMT_APPLICATION AMT_CREDIT AMT_DOWN_PAYMENT AMT_GOODS_PRICE WEEKDAY_APPR_PROCESS_START HOUR_APPR_PROCESS_START FLAG_LAST_APPL_PER_CONTRACT NFLAG_LAST_APPL_IN_DAY RATE_DOWN_PAYMENT RATE_INTEREST_PRIMARY RATE_INTEREST_PRIVILEGED NAME_CASH_LOAN_PURPOSE NAME_CONTRACT_STATUS DAYS_DECISION NAME_PAYMENT_TYPE CODE_REJECT_REASON NAME_TYPE_SUITE NAME_CLIENT_TYPE NAME_GOODS_CATEGORY NAME_PORTFOLIO NAME_PRODUCT_TYPE CHANNEL_TYPE SELLERPLACE_AREA NAME_SELLER_INDUSTRY CNT_PAYMENT NAME_YIELD_GROUP PRODUCT_COMBINATION DAYS_FIRST_DRAWING DAYS_FIRST_DUE DAYS_LAST_DUE_1ST_VERSION DAYS_LAST_DUE DAYS_TERMINATION NFLAG_INSURED_ON_APPROVAL
count 1670214 1670214 1670214 1297979 1670214 1670213 774370 1284699 1670214 1670214 1670214 1670214 774370 5951 5951 1670214 1670214 1670214 1670214 1670214 849809 1670214 1670214 1670214 1670214 1670214 1670214 1670214 1297984 1670214 1669868 997149 997149 997149 997149 997149 997149
size 1670214 1670214 1670214 1670214 1670214 1670214 1670214 1670214 1670214 1670214 1670214 1670214 1670214 1670214 1670214 1670214 1670214 1670214 1670214 1670214 1670214 1670214 1670214 1670214 1670214 1670214 1670214 1670214 1670214 1670214 1670214 1670214 1670214 1670214 1670214 1670214 1670214
nunique 1670214 338857 4 357959 93885 86803 29278 93885 7 24 2 2 207033 148 25 25 4 2922 4 9 7 4 28 5 3 8 2097 11 49 5 17 2838 2892 4605 2873 2830 2
In [11]:
application_data.agg(['count','size','nunique'])
Out[11]:
SK_ID_CURR TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY AMT_GOODS_PRICE NAME_TYPE_SUITE NAME_INCOME_TYPE NAME_EDUCATION_TYPE NAME_FAMILY_STATUS NAME_HOUSING_TYPE REGION_POPULATION_RELATIVE DAYS_BIRTH DAYS_EMPLOYED DAYS_REGISTRATION DAYS_ID_PUBLISH OWN_CAR_AGE FLAG_MOBIL FLAG_EMP_PHONE FLAG_WORK_PHONE FLAG_CONT_MOBILE FLAG_PHONE FLAG_EMAIL OCCUPATION_TYPE CNT_FAM_MEMBERS REGION_RATING_CLIENT REGION_RATING_CLIENT_W_CITY WEEKDAY_APPR_PROCESS_START HOUR_APPR_PROCESS_START REG_REGION_NOT_LIVE_REGION REG_REGION_NOT_WORK_REGION LIVE_REGION_NOT_WORK_REGION REG_CITY_NOT_LIVE_CITY REG_CITY_NOT_WORK_CITY LIVE_CITY_NOT_WORK_CITY ORGANIZATION_TYPE EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3 APARTMENTS_AVG BASEMENTAREA_AVG YEARS_BEGINEXPLUATATION_AVG YEARS_BUILD_AVG COMMONAREA_AVG ELEVATORS_AVG ENTRANCES_AVG FLOORSMAX_AVG FLOORSMIN_AVG LANDAREA_AVG LIVINGAPARTMENTS_AVG LIVINGAREA_AVG NONLIVINGAPARTMENTS_AVG NONLIVINGAREA_AVG APARTMENTS_MODE BASEMENTAREA_MODE YEARS_BEGINEXPLUATATION_MODE YEARS_BUILD_MODE COMMONAREA_MODE ELEVATORS_MODE ENTRANCES_MODE FLOORSMAX_MODE FLOORSMIN_MODE LANDAREA_MODE LIVINGAPARTMENTS_MODE LIVINGAREA_MODE NONLIVINGAPARTMENTS_MODE NONLIVINGAREA_MODE APARTMENTS_MEDI BASEMENTAREA_MEDI YEARS_BEGINEXPLUATATION_MEDI YEARS_BUILD_MEDI COMMONAREA_MEDI ELEVATORS_MEDI ENTRANCES_MEDI FLOORSMAX_MEDI FLOORSMIN_MEDI LANDAREA_MEDI LIVINGAPARTMENTS_MEDI LIVINGAREA_MEDI NONLIVINGAPARTMENTS_MEDI NONLIVINGAREA_MEDI FONDKAPREMONT_MODE HOUSETYPE_MODE TOTALAREA_MODE WALLSMATERIAL_MODE EMERGENCYSTATE_MODE OBS_30_CNT_SOCIAL_CIRCLE DEF_30_CNT_SOCIAL_CIRCLE OBS_60_CNT_SOCIAL_CIRCLE DEF_60_CNT_SOCIAL_CIRCLE DAYS_LAST_PHONE_CHANGE FLAG_DOCUMENT_2 FLAG_DOCUMENT_3 FLAG_DOCUMENT_4 FLAG_DOCUMENT_5 FLAG_DOCUMENT_6 FLAG_DOCUMENT_7 FLAG_DOCUMENT_8 FLAG_DOCUMENT_9 FLAG_DOCUMENT_10 FLAG_DOCUMENT_11 FLAG_DOCUMENT_12 FLAG_DOCUMENT_13 FLAG_DOCUMENT_14 FLAG_DOCUMENT_15 FLAG_DOCUMENT_16 FLAG_DOCUMENT_17 FLAG_DOCUMENT_18 FLAG_DOCUMENT_19 FLAG_DOCUMENT_20 FLAG_DOCUMENT_21 AMT_REQ_CREDIT_BUREAU_HOUR AMT_REQ_CREDIT_BUREAU_DAY AMT_REQ_CREDIT_BUREAU_WEEK AMT_REQ_CREDIT_BUREAU_MON AMT_REQ_CREDIT_BUREAU_QRT AMT_REQ_CREDIT_BUREAU_YEAR
count 307511 307511 307511 307511 307511 307511 307511 307511 307511 307499 307233 306219 307511 307511 307511 307511 307511 307511 307511 307511 307511 104582 307511 307511 307511 307511 307511 307511 211120 307509 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 134133 306851 246546 151450 127568 157504 103023 92646 143620 152683 154491 98869 124921 97312 153161 93997 137829 151450 127568 157504 103023 92646 143620 152683 154491 98869 124921 97312 153161 93997 137829 151450 127568 157504 103023 92646 143620 152683 154491 98869 124921 97312 153161 93997 137829 97216 153214 159080 151170 161756 306490 306490 306490 306490 307510 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 265992 265992 265992 265992 265992 265992
size 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511 307511
nunique 307511 2 2 3 2 2 15 2548 5603 13672 1002 7 8 5 6 6 81 17460 12574 15688 6168 62 2 2 2 2 2 2 18 17 3 3 7 24 2 2 2 2 2 2 58 114584 119831 814 2339 3780 285 149 3181 257 285 403 305 3527 1868 5199 386 3290 760 3841 221 154 3128 26 30 25 25 3563 736 5301 167 3327 1148 3772 245 151 3202 46 46 49 47 3560 1097 5281 214 3323 4 3 5116 7 2 33 10 33 9 3773 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 5 9 9 24 11 25
In [12]:
round(previous_application.isnull().sum()/previous_application.shape[0]*100,2).sort_values(ascending=False)
Out[12]:
RATE_INTEREST_PRIVILEGED       99.64
RATE_INTEREST_PRIMARY          99.64
RATE_DOWN_PAYMENT              53.64
AMT_DOWN_PAYMENT               53.64
NAME_TYPE_SUITE                49.12
NFLAG_INSURED_ON_APPROVAL      40.30
DAYS_FIRST_DRAWING             40.30
DAYS_FIRST_DUE                 40.30
DAYS_LAST_DUE_1ST_VERSION      40.30
DAYS_LAST_DUE                  40.30
DAYS_TERMINATION               40.30
AMT_GOODS_PRICE                23.08
AMT_ANNUITY                    22.29
CNT_PAYMENT                    22.29
PRODUCT_COMBINATION             0.02
CHANNEL_TYPE                    0.00
NAME_PRODUCT_TYPE               0.00
NAME_YIELD_GROUP                0.00
SELLERPLACE_AREA                0.00
NAME_SELLER_INDUSTRY            0.00
NAME_GOODS_CATEGORY             0.00
NAME_PORTFOLIO                  0.00
SK_ID_PREV                      0.00
NAME_CLIENT_TYPE                0.00
CODE_REJECT_REASON              0.00
SK_ID_CURR                      0.00
DAYS_DECISION                   0.00
NAME_CONTRACT_STATUS            0.00
NAME_CASH_LOAN_PURPOSE          0.00
NFLAG_LAST_APPL_IN_DAY          0.00
FLAG_LAST_APPL_PER_CONTRACT     0.00
HOUR_APPR_PROCESS_START         0.00
WEEKDAY_APPR_PROCESS_START      0.00
AMT_CREDIT                      0.00
AMT_APPLICATION                 0.00
NAME_CONTRACT_TYPE              0.00
NAME_PAYMENT_TYPE               0.00
dtype: float64
In [18]:
deleted_column.shape
Out[18]:
(307511, 49)
In [19]:
deleted_column.columns
Out[19]:
Index(['OWN_CAR_AGE', 'EXT_SOURCE_1', 'APARTMENTS_AVG', 'BASEMENTAREA_AVG', 'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BUILD_AVG', 'COMMONAREA_AVG', 'ELEVATORS_AVG', 'ENTRANCES_AVG', 'FLOORSMAX_AVG', 'FLOORSMIN_AVG', 'LANDAREA_AVG', 'LIVINGAPARTMENTS_AVG', 'LIVINGAREA_AVG', 'NONLIVINGAPARTMENTS_AVG', 'NONLIVINGAREA_AVG', 'APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE', 'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'ELEVATORS_MODE', 'ENTRANCES_MODE', 'FLOORSMAX_MODE', 'FLOORSMIN_MODE', 'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE', 'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE', 'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI', 'YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'ELEVATORS_MEDI', 'ENTRANCES_MEDI', 'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI', 'LIVINGAPARTMENTS_MEDI', 'LIVINGAREA_MEDI', 'NONLIVINGAPARTMENTS_MEDI', 'NONLIVINGAREA_MEDI', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'TOTALAREA_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE'], dtype='object')
In [20]:
round(application_data_up.isnull().sum()/application_data_up.shape[0]*100,2).sort_values(ascending=False)
Out[20]:
OCCUPATION_TYPE                31.35
EXT_SOURCE_3                   19.83
AMT_REQ_CREDIT_BUREAU_YEAR     13.50
AMT_REQ_CREDIT_BUREAU_QRT      13.50
AMT_REQ_CREDIT_BUREAU_MON      13.50
AMT_REQ_CREDIT_BUREAU_WEEK     13.50
AMT_REQ_CREDIT_BUREAU_DAY      13.50
AMT_REQ_CREDIT_BUREAU_HOUR     13.50
NAME_TYPE_SUITE                 0.42
OBS_30_CNT_SOCIAL_CIRCLE        0.33
DEF_30_CNT_SOCIAL_CIRCLE        0.33
OBS_60_CNT_SOCIAL_CIRCLE        0.33
DEF_60_CNT_SOCIAL_CIRCLE        0.33
EXT_SOURCE_2                    0.21
AMT_GOODS_PRICE                 0.09
FLAG_DOCUMENT_7                 0.00
DAYS_LAST_PHONE_CHANGE          0.00
FLAG_DOCUMENT_2                 0.00
FLAG_DOCUMENT_3                 0.00
FLAG_DOCUMENT_4                 0.00
FLAG_DOCUMENT_5                 0.00
FLAG_DOCUMENT_6                 0.00
FLAG_DOCUMENT_11                0.00
FLAG_DOCUMENT_8                 0.00
FLAG_DOCUMENT_9                 0.00
FLAG_DOCUMENT_10                0.00
LIVE_CITY_NOT_WORK_CITY         0.00
FLAG_DOCUMENT_12                0.00
FLAG_DOCUMENT_13                0.00
FLAG_DOCUMENT_14                0.00
FLAG_DOCUMENT_15                0.00
FLAG_DOCUMENT_16                0.00
FLAG_DOCUMENT_17                0.00
FLAG_DOCUMENT_18                0.00
FLAG_DOCUMENT_19                0.00
FLAG_DOCUMENT_20                0.00
FLAG_DOCUMENT_21                0.00
ORGANIZATION_TYPE               0.00
SK_ID_CURR                      0.00
REG_CITY_NOT_WORK_CITY          0.00
AMT_ANNUITY                     0.00
DAYS_BIRTH                      0.00
REGION_POPULATION_RELATIVE      0.00
NAME_HOUSING_TYPE               0.00
NAME_FAMILY_STATUS              0.00
NAME_EDUCATION_TYPE             0.00
NAME_INCOME_TYPE                0.00
AMT_CREDIT                      0.00
TARGET                          0.00
AMT_INCOME_TOTAL                0.00
CNT_CHILDREN                    0.00
FLAG_OWN_REALTY                 0.00
FLAG_OWN_CAR                    0.00
CODE_GENDER                     0.00
NAME_CONTRACT_TYPE              0.00
DAYS_EMPLOYED                   0.00
DAYS_REGISTRATION               0.00
DAYS_ID_PUBLISH                 0.00
FLAG_MOBIL                      0.00
FLAG_EMP_PHONE                  0.00
FLAG_WORK_PHONE                 0.00
FLAG_CONT_MOBILE                0.00
FLAG_PHONE                      0.00
FLAG_EMAIL                      0.00
CNT_FAM_MEMBERS                 0.00
REGION_RATING_CLIENT            0.00
REGION_RATING_CLIENT_W_CITY     0.00
WEEKDAY_APPR_PROCESS_START      0.00
HOUR_APPR_PROCESS_START         0.00
REG_REGION_NOT_LIVE_REGION      0.00
REG_REGION_NOT_WORK_REGION      0.00
LIVE_REGION_NOT_WORK_REGION     0.00
REG_CITY_NOT_LIVE_CITY          0.00
dtype: float64
In [21]:
application_data_up["OCCUPATION_TYPE"].unique()
Out[21]:
array(['Laborers', 'Core staff', 'Accountants', 'Managers', nan,
       'Drivers', 'Sales staff', 'Cleaning staff', 'Cooking staff',
       'Private service staff', 'Medicine staff', 'Security staff',
       'High skill tech staff', 'Waiters/barmen staff',
       'Low-skill Laborers', 'Realty agents', 'Secretaries', 'IT staff',
       'HR staff'], dtype=object)
In [22]:
application_data_up["OCCUPATION_TYPE"].value_counts()
Out[22]:
Laborers                 55186
Sales staff              32102
Core staff               27570
Managers                 21371
Drivers                  18603
High skill tech staff    11380
Accountants               9813
Medicine staff            8537
Security staff            6721
Cooking staff             5946
Cleaning staff            4653
Private service staff     2652
Low-skill Laborers        2093
Waiters/barmen staff      1348
Secretaries               1305
Realty agents              751
HR staff                   563
IT staff                   526
Name: OCCUPATION_TYPE, dtype: int64
In [23]:
application_data_up["OCCUPATION_TYPE"].isnull().sum()
Out[23]:
96391
In [24]:
application_data_up["OCCUPATION_TYPE"].replace(np.NaN, "unknown", inplace=True)
In [25]:
application_data_up["OCCUPATION_TYPE"].isnull().sum()
Out[25]:
0
In [26]:
application_data_up["OCCUPATION_TYPE"].value_counts()
Out[26]:
unknown                  96391
Laborers                 55186
Sales staff              32102
Core staff               27570
Managers                 21371
Drivers                  18603
High skill tech staff    11380
Accountants               9813
Medicine staff            8537
Security staff            6721
Cooking staff             5946
Cleaning staff            4653
Private service staff     2652
Low-skill Laborers        2093
Waiters/barmen staff      1348
Secretaries               1305
Realty agents              751
HR staff                   563
IT staff                   526
Name: OCCUPATION_TYPE, dtype: int64
In [27]:
round(application_data_up.isnull().sum()/application_data_up.shape[0]*100,2).sort_values(ascending=False)
Out[27]:
EXT_SOURCE_3                   19.83
AMT_REQ_CREDIT_BUREAU_YEAR     13.50
AMT_REQ_CREDIT_BUREAU_QRT      13.50
AMT_REQ_CREDIT_BUREAU_MON      13.50
AMT_REQ_CREDIT_BUREAU_WEEK     13.50
AMT_REQ_CREDIT_BUREAU_DAY      13.50
AMT_REQ_CREDIT_BUREAU_HOUR     13.50
NAME_TYPE_SUITE                 0.42
OBS_30_CNT_SOCIAL_CIRCLE        0.33
DEF_30_CNT_SOCIAL_CIRCLE        0.33
OBS_60_CNT_SOCIAL_CIRCLE        0.33
DEF_60_CNT_SOCIAL_CIRCLE        0.33
EXT_SOURCE_2                    0.21
AMT_GOODS_PRICE                 0.09
FLAG_DOCUMENT_8                 0.00
DAYS_LAST_PHONE_CHANGE          0.00
FLAG_DOCUMENT_2                 0.00
FLAG_DOCUMENT_3                 0.00
FLAG_DOCUMENT_4                 0.00
FLAG_DOCUMENT_5                 0.00
FLAG_DOCUMENT_6                 0.00
FLAG_DOCUMENT_7                 0.00
FLAG_DOCUMENT_12                0.00
FLAG_DOCUMENT_9                 0.00
FLAG_DOCUMENT_10                0.00
FLAG_DOCUMENT_11                0.00
LIVE_CITY_NOT_WORK_CITY         0.00
FLAG_DOCUMENT_13                0.00
FLAG_DOCUMENT_14                0.00
FLAG_DOCUMENT_15                0.00
FLAG_DOCUMENT_16                0.00
FLAG_DOCUMENT_17                0.00
FLAG_DOCUMENT_18                0.00
FLAG_DOCUMENT_19                0.00
FLAG_DOCUMENT_20                0.00
FLAG_DOCUMENT_21                0.00
ORGANIZATION_TYPE               0.00
SK_ID_CURR                      0.00
REG_CITY_NOT_WORK_CITY          0.00
AMT_ANNUITY                     0.00
DAYS_BIRTH                      0.00
REGION_POPULATION_RELATIVE      0.00
NAME_HOUSING_TYPE               0.00
NAME_FAMILY_STATUS              0.00
NAME_EDUCATION_TYPE             0.00
NAME_INCOME_TYPE                0.00
AMT_CREDIT                      0.00
DAYS_REGISTRATION               0.00
AMT_INCOME_TOTAL                0.00
CNT_CHILDREN                    0.00
FLAG_OWN_REALTY                 0.00
FLAG_OWN_CAR                    0.00
CODE_GENDER                     0.00
NAME_CONTRACT_TYPE              0.00
DAYS_EMPLOYED                   0.00
DAYS_ID_PUBLISH                 0.00
TARGET                          0.00
REGION_RATING_CLIENT            0.00
LIVE_REGION_NOT_WORK_REGION     0.00
REG_REGION_NOT_WORK_REGION      0.00
REG_REGION_NOT_LIVE_REGION      0.00
HOUR_APPR_PROCESS_START         0.00
WEEKDAY_APPR_PROCESS_START      0.00
REGION_RATING_CLIENT_W_CITY     0.00
CNT_FAM_MEMBERS                 0.00
FLAG_MOBIL                      0.00
OCCUPATION_TYPE                 0.00
FLAG_EMAIL                      0.00
FLAG_PHONE                      0.00
FLAG_CONT_MOBILE                0.00
FLAG_WORK_PHONE                 0.00
FLAG_EMP_PHONE                  0.00
REG_CITY_NOT_LIVE_CITY          0.00
dtype: float64
In [28]:
not_required=['FLAG_DOCUMENT_21','FLAG_DOCUMENT_20','FLAG_DOCUMENT_19','FLAG_DOCUMENT_18','FLAG_DOCUMENT_17','FLAG_DOCUMENT_16',
             'FLAG_DOCUMENT_15','FLAG_DOCUMENT_14','FLAG_DOCUMENT_13','FLAG_DOCUMENT_12','FLAG_DOCUMENT_11',
             'FLAG_DOCUMENT_10','FLAG_DOCUMENT_9','FLAG_DOCUMENT_8','FLAG_DOCUMENT_7','FLAG_DOCUMENT_6','FLAG_DOCUMENT_5',
             'FLAG_DOCUMENT_4','FLAG_DOCUMENT_3','FLAG_DOCUMENT_2','OBS_30_CNT_SOCIAL_CIRCLE','DEF_30_CNT_SOCIAL_CIRCLE',
             'OBS_60_CNT_SOCIAL_CIRCLE','DEF_60_CNT_SOCIAL_CIRCLE','AMT_REQ_CREDIT_BUREAU_YEAR','AMT_REQ_CREDIT_BUREAU_MON',
             'AMT_REQ_CREDIT_BUREAU_WEEK','AMT_REQ_CREDIT_BUREAU_DAY','AMT_REQ_CREDIT_BUREAU_HOUR','AMT_REQ_CREDIT_BUREAU_QRT']
In [29]:
application_data_up.drop(labels=not_required,axis=1,inplace=True)
In [30]:
application_data_up.shape
Out[30]:
(307511, 43)
In [31]:
round(application_data_up.isnull().sum()/application_data_up.shape[0]*100,2).sort_values(ascending=False)
Out[31]:
EXT_SOURCE_3                   19.83
NAME_TYPE_SUITE                 0.42
EXT_SOURCE_2                    0.21
AMT_GOODS_PRICE                 0.09
HOUR_APPR_PROCESS_START         0.00
FLAG_PHONE                      0.00
FLAG_EMAIL                      0.00
OCCUPATION_TYPE                 0.00
CNT_FAM_MEMBERS                 0.00
REGION_RATING_CLIENT            0.00
REGION_RATING_CLIENT_W_CITY     0.00
WEEKDAY_APPR_PROCESS_START      0.00
SK_ID_CURR                      0.00
REG_REGION_NOT_LIVE_REGION      0.00
FLAG_WORK_PHONE                 0.00
REG_REGION_NOT_WORK_REGION      0.00
LIVE_REGION_NOT_WORK_REGION     0.00
REG_CITY_NOT_LIVE_CITY          0.00
REG_CITY_NOT_WORK_CITY          0.00
LIVE_CITY_NOT_WORK_CITY         0.00
ORGANIZATION_TYPE               0.00
FLAG_CONT_MOBILE                0.00
FLAG_MOBIL                      0.00
FLAG_EMP_PHONE                  0.00
TARGET                          0.00
NAME_CONTRACT_TYPE              0.00
CODE_GENDER                     0.00
FLAG_OWN_CAR                    0.00
FLAG_OWN_REALTY                 0.00
CNT_CHILDREN                    0.00
AMT_INCOME_TOTAL                0.00
AMT_CREDIT                      0.00
AMT_ANNUITY                     0.00
NAME_INCOME_TYPE                0.00
NAME_EDUCATION_TYPE             0.00
NAME_FAMILY_STATUS              0.00
NAME_HOUSING_TYPE               0.00
REGION_POPULATION_RELATIVE      0.00
DAYS_BIRTH                      0.00
DAYS_EMPLOYED                   0.00
DAYS_REGISTRATION               0.00
DAYS_ID_PUBLISH                 0.00
DAYS_LAST_PHONE_CHANGE          0.00
dtype: float64
In [32]:
plt.figure(figsize=[30,10])
sns.barplot(x=application_data_up.OCCUPATION_TYPE.value_counts().index,
            y=application_data_up.OCCUPATION_TYPE.value_counts().values).set_title("OCCUPATION TYPE COUNTS",fontsize=40, color='blue',pad=30)
plt.xlabel('OCCUPATION TYPE', fontsize=40, color='blue')
plt.xticks(rotation=45)
plt.show()
In [33]:
plt.figure(figsize=[20,15])

sns.set_style('darkgrid')

plt.subplot(2,2,1)
sns.boxplot(application_data_up['EXT_SOURCE_2']).set_title("EXT_SOURCE_2",fontsize=30,color='blue',pad=30)

plt.subplot(2,2,2)
sns.boxplot(application_data_up['EXT_SOURCE_3']).set_title("EXT_SOURCE_3",fontsize=30,color='blue',pad=30)

plt.subplot(2,2,3)
sns.distplot(application_data_up['EXT_SOURCE_2'],color='b')

plt.subplot(2,2,4)
sns.distplot(application_data_up['EXT_SOURCE_3'],color='b')

plt.show()
In [34]:
#REPLACING THE MISSING VALUES WITH ITS CORRESPONDING MEDIAN
for column in ['EXT_SOURCE_2','EXT_SOURCE_3']:
    application_data_up[column].fillna(application_data_up[column].median(), inplace=True)
In [35]:
# CHECKING THE CORRELATION BETWEEN THE LOAN AMOUNT DEMANDED VS THE GOOD'S PRICE

sns.scatterplot(x=application_data_up['AMT_CREDIT'], y=application_data_up['AMT_GOODS_PRICE'], data=application_data_up)
plt.title("Correlation between the loan amount and the price of goods for which loan was given\n",
         fontdict={'fontsize':35,'fontweight':10,'color':'Blue'})
plt.xlabel("Loan Amount",fontdict={'fontsize':35,'fontweight':10,'color':'Black'})
plt.ylabel("Price of Goods",fontdict={'fontsize':35,'fontweight':10,'color':'Black'})
plt.show()
In [36]:
# Imputing the above mentioned logic 

application_data_up['AMT_GOODS_PRICE']=np.where(application_data_up['AMT_GOODS_PRICE'].isnull()==True,
                                      application_data_up['AMT_CREDIT'],application_data_up['AMT_GOODS_PRICE'])
In [37]:
application_data_up['AMT_GOODS_PRICE'].isnull().sum()
Out[37]:
0
In [38]:
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff
In [39]:
#Percentage of each category

go.Figure(data=[go.Pie(labels=application_data_up.NAME_TYPE_SUITE.value_counts().index,
                       values=application_data_up.NAME_TYPE_SUITE.value_counts().values, hole=.5, title="NAME_TYPE_VALUE COUNTS",
                       pull=[0,0.1,0.1,0.1,0.1,0.1,0.1])])
In [40]:
application_data_up['NAME_TYPE_SUITE'].isnull().sum()/application_data_up.shape[0]*100
Out[40]:
0.42014757195677555
In [41]:
application_data_up['NAME_TYPE_SUITE'].isnull().sum()
Out[41]:
1292
In [42]:
application_data_up['NAME_TYPE_SUITE'].mode()
Out[42]:
0    Unaccompanied
Name: NAME_TYPE_SUITE, dtype: object
In [43]:
#Replacing missing values with MODE

application_data_up['NAME_TYPE_SUITE'].fillna(application_data_up['NAME_TYPE_SUITE'].mode()[0],inplace=True)
In [44]:
round(application_data_up.isnull().sum()/application_data_up.shape[0]*100,4).sort_values(ascending=False)
Out[44]:
AMT_ANNUITY                    0.0039
CNT_FAM_MEMBERS                0.0007
DAYS_LAST_PHONE_CHANGE         0.0003
HOUR_APPR_PROCESS_START        0.0000
FLAG_CONT_MOBILE               0.0000
FLAG_PHONE                     0.0000
FLAG_EMAIL                     0.0000
OCCUPATION_TYPE                0.0000
REGION_RATING_CLIENT           0.0000
REGION_RATING_CLIENT_W_CITY    0.0000
WEEKDAY_APPR_PROCESS_START     0.0000
REG_REGION_NOT_LIVE_REGION     0.0000
FLAG_EMP_PHONE                 0.0000
REG_REGION_NOT_WORK_REGION     0.0000
LIVE_REGION_NOT_WORK_REGION    0.0000
REG_CITY_NOT_LIVE_CITY         0.0000
REG_CITY_NOT_WORK_CITY         0.0000
LIVE_CITY_NOT_WORK_CITY        0.0000
ORGANIZATION_TYPE              0.0000
EXT_SOURCE_2                   0.0000
EXT_SOURCE_3                   0.0000
FLAG_WORK_PHONE                0.0000
SK_ID_CURR                     0.0000
TARGET                         0.0000
DAYS_ID_PUBLISH                0.0000
NAME_CONTRACT_TYPE             0.0000
CODE_GENDER                    0.0000
FLAG_OWN_CAR                   0.0000
FLAG_OWN_REALTY                0.0000
CNT_CHILDREN                   0.0000
AMT_INCOME_TOTAL               0.0000
AMT_CREDIT                     0.0000
AMT_GOODS_PRICE                0.0000
NAME_TYPE_SUITE                0.0000
NAME_INCOME_TYPE               0.0000
NAME_EDUCATION_TYPE            0.0000
NAME_FAMILY_STATUS             0.0000
NAME_HOUSING_TYPE              0.0000
REGION_POPULATION_RELATIVE     0.0000
DAYS_BIRTH                     0.0000
DAYS_EMPLOYED                  0.0000
DAYS_REGISTRATION              0.0000
FLAG_MOBIL                     0.0000
dtype: float64
In [45]:
#REMAINING COLUMNS WITH NEGLIGIBLE NULL VALUES (LESS THAN 1%)

null_col=['CNT_FAM_MEMBERS','AMT_ANNUITY', 'DAYS_LAST_PHONE_CHANGE']
for column in null_col:
    application_data_up[column].fillna(application_data_up[column].median(), inplace=True)
In [46]:
round(application_data_up.isnull().sum()/application_data_up.shape[0]*100,4).sort_values(ascending=False)
Out[46]:
SK_ID_CURR                     0.0
HOUR_APPR_PROCESS_START        0.0
FLAG_CONT_MOBILE               0.0
FLAG_PHONE                     0.0
FLAG_EMAIL                     0.0
OCCUPATION_TYPE                0.0
CNT_FAM_MEMBERS                0.0
REGION_RATING_CLIENT           0.0
REGION_RATING_CLIENT_W_CITY    0.0
WEEKDAY_APPR_PROCESS_START     0.0
REG_REGION_NOT_LIVE_REGION     0.0
FLAG_EMP_PHONE                 0.0
REG_REGION_NOT_WORK_REGION     0.0
LIVE_REGION_NOT_WORK_REGION    0.0
REG_CITY_NOT_LIVE_CITY         0.0
REG_CITY_NOT_WORK_CITY         0.0
LIVE_CITY_NOT_WORK_CITY        0.0
ORGANIZATION_TYPE              0.0
EXT_SOURCE_2                   0.0
EXT_SOURCE_3                   0.0
FLAG_WORK_PHONE                0.0
FLAG_MOBIL                     0.0
TARGET                         0.0
AMT_GOODS_PRICE                0.0
NAME_CONTRACT_TYPE             0.0
CODE_GENDER                    0.0
FLAG_OWN_CAR                   0.0
FLAG_OWN_REALTY                0.0
CNT_CHILDREN                   0.0
AMT_INCOME_TOTAL               0.0
AMT_CREDIT                     0.0
AMT_ANNUITY                    0.0
NAME_TYPE_SUITE                0.0
DAYS_ID_PUBLISH                0.0
NAME_INCOME_TYPE               0.0
NAME_EDUCATION_TYPE            0.0
NAME_FAMILY_STATUS             0.0
NAME_HOUSING_TYPE              0.0
REGION_POPULATION_RELATIVE     0.0
DAYS_BIRTH                     0.0
DAYS_EMPLOYED                  0.0
DAYS_REGISTRATION              0.0
DAYS_LAST_PHONE_CHANGE         0.0
dtype: float64
In [47]:
application_data_up.info(verbose=True)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Data columns (total 43 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   SK_ID_CURR                   307511 non-null  int64  
 1   TARGET                       307511 non-null  int64  
 2   NAME_CONTRACT_TYPE           307511 non-null  object 
 3   CODE_GENDER                  307511 non-null  object 
 4   FLAG_OWN_CAR                 307511 non-null  object 
 5   FLAG_OWN_REALTY              307511 non-null  object 
 6   CNT_CHILDREN                 307511 non-null  int64  
 7   AMT_INCOME_TOTAL             307511 non-null  float64
 8   AMT_CREDIT                   307511 non-null  float64
 9   AMT_ANNUITY                  307511 non-null  float64
 10  AMT_GOODS_PRICE              307511 non-null  float64
 11  NAME_TYPE_SUITE              307511 non-null  object 
 12  NAME_INCOME_TYPE             307511 non-null  object 
 13  NAME_EDUCATION_TYPE          307511 non-null  object 
 14  NAME_FAMILY_STATUS           307511 non-null  object 
 15  NAME_HOUSING_TYPE            307511 non-null  object 
 16  REGION_POPULATION_RELATIVE   307511 non-null  float64
 17  DAYS_BIRTH                   307511 non-null  int64  
 18  DAYS_EMPLOYED                307511 non-null  int64  
 19  DAYS_REGISTRATION            307511 non-null  float64
 20  DAYS_ID_PUBLISH              307511 non-null  int64  
 21  FLAG_MOBIL                   307511 non-null  int64  
 22  FLAG_EMP_PHONE               307511 non-null  int64  
 23  FLAG_WORK_PHONE              307511 non-null  int64  
 24  FLAG_CONT_MOBILE             307511 non-null  int64  
 25  FLAG_PHONE                   307511 non-null  int64  
 26  FLAG_EMAIL                   307511 non-null  int64  
 27  OCCUPATION_TYPE              307511 non-null  object 
 28  CNT_FAM_MEMBERS              307511 non-null  float64
 29  REGION_RATING_CLIENT         307511 non-null  int64  
 30  REGION_RATING_CLIENT_W_CITY  307511 non-null  int64  
 31  WEEKDAY_APPR_PROCESS_START   307511 non-null  object 
 32  HOUR_APPR_PROCESS_START      307511 non-null  int64  
 33  REG_REGION_NOT_LIVE_REGION   307511 non-null  int64  
 34  REG_REGION_NOT_WORK_REGION   307511 non-null  int64  
 35  LIVE_REGION_NOT_WORK_REGION  307511 non-null  int64  
 36  REG_CITY_NOT_LIVE_CITY       307511 non-null  int64  
 37  REG_CITY_NOT_WORK_CITY       307511 non-null  int64  
 38  LIVE_CITY_NOT_WORK_CITY      307511 non-null  int64  
 39  ORGANIZATION_TYPE            307511 non-null  object 
 40  EXT_SOURCE_2                 307511 non-null  float64
 41  EXT_SOURCE_3                 307511 non-null  float64
 42  DAYS_LAST_PHONE_CHANGE       307511 non-null  float64
dtypes: float64(10), int64(21), object(12)
memory usage: 100.9+ MB
In [48]:
dayandcount=['CNT_FAM_MEMBERS','DAYS_REGISTRATION','DAYS_LAST_PHONE_CHANGE']
application_data_up.loc[:, dayandcount]=application_data_up.loc[:,dayandcount].apply(lambda x: x.astype('int64',errors='ignore'))
In [49]:
application_data_up.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Data columns (total 43 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   SK_ID_CURR                   307511 non-null  int64  
 1   TARGET                       307511 non-null  int64  
 2   NAME_CONTRACT_TYPE           307511 non-null  object 
 3   CODE_GENDER                  307511 non-null  object 
 4   FLAG_OWN_CAR                 307511 non-null  object 
 5   FLAG_OWN_REALTY              307511 non-null  object 
 6   CNT_CHILDREN                 307511 non-null  int64  
 7   AMT_INCOME_TOTAL             307511 non-null  float64
 8   AMT_CREDIT                   307511 non-null  float64
 9   AMT_ANNUITY                  307511 non-null  float64
 10  AMT_GOODS_PRICE              307511 non-null  float64
 11  NAME_TYPE_SUITE              307511 non-null  object 
 12  NAME_INCOME_TYPE             307511 non-null  object 
 13  NAME_EDUCATION_TYPE          307511 non-null  object 
 14  NAME_FAMILY_STATUS           307511 non-null  object 
 15  NAME_HOUSING_TYPE            307511 non-null  object 
 16  REGION_POPULATION_RELATIVE   307511 non-null  float64
 17  DAYS_BIRTH                   307511 non-null  int64  
 18  DAYS_EMPLOYED                307511 non-null  int64  
 19  DAYS_REGISTRATION            307511 non-null  int64  
 20  DAYS_ID_PUBLISH              307511 non-null  int64  
 21  FLAG_MOBIL                   307511 non-null  int64  
 22  FLAG_EMP_PHONE               307511 non-null  int64  
 23  FLAG_WORK_PHONE              307511 non-null  int64  
 24  FLAG_CONT_MOBILE             307511 non-null  int64  
 25  FLAG_PHONE                   307511 non-null  int64  
 26  FLAG_EMAIL                   307511 non-null  int64  
 27  OCCUPATION_TYPE              307511 non-null  object 
 28  CNT_FAM_MEMBERS              307511 non-null  int64  
 29  REGION_RATING_CLIENT         307511 non-null  int64  
 30  REGION_RATING_CLIENT_W_CITY  307511 non-null  int64  
 31  WEEKDAY_APPR_PROCESS_START   307511 non-null  object 
 32  HOUR_APPR_PROCESS_START      307511 non-null  int64  
 33  REG_REGION_NOT_LIVE_REGION   307511 non-null  int64  
 34  REG_REGION_NOT_WORK_REGION   307511 non-null  int64  
 35  LIVE_REGION_NOT_WORK_REGION  307511 non-null  int64  
 36  REG_CITY_NOT_LIVE_CITY       307511 non-null  int64  
 37  REG_CITY_NOT_WORK_CITY       307511 non-null  int64  
 38  LIVE_CITY_NOT_WORK_CITY      307511 non-null  int64  
 39  ORGANIZATION_TYPE            307511 non-null  object 
 40  EXT_SOURCE_2                 307511 non-null  float64
 41  EXT_SOURCE_3                 307511 non-null  float64
 42  DAYS_LAST_PHONE_CHANGE       307511 non-null  int64  
dtypes: float64(7), int64(24), object(12)
memory usage: 100.9+ MB

CAHNGING ALL VALUES OF COLUMNS WITH DATA TYPE OBJECT TO STRING¶

In [50]:
#LISTING OBJECT TYPE COLUMNS AND CONFIRMING THE VALUES TO BE IN STRING TYPE

obj_col=list(application_data_up.select_dtypes(include='object').columns)

application_data_up.loc[:,obj_col]=application_data_up.loc[:,obj_col].apply(lambda x: x.astype('str'))
In [51]:
application_data_up.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Data columns (total 43 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   SK_ID_CURR                   307511 non-null  int64  
 1   TARGET                       307511 non-null  int64  
 2   NAME_CONTRACT_TYPE           307511 non-null  object 
 3   CODE_GENDER                  307511 non-null  object 
 4   FLAG_OWN_CAR                 307511 non-null  object 
 5   FLAG_OWN_REALTY              307511 non-null  object 
 6   CNT_CHILDREN                 307511 non-null  int64  
 7   AMT_INCOME_TOTAL             307511 non-null  float64
 8   AMT_CREDIT                   307511 non-null  float64
 9   AMT_ANNUITY                  307511 non-null  float64
 10  AMT_GOODS_PRICE              307511 non-null  float64
 11  NAME_TYPE_SUITE              307511 non-null  object 
 12  NAME_INCOME_TYPE             307511 non-null  object 
 13  NAME_EDUCATION_TYPE          307511 non-null  object 
 14  NAME_FAMILY_STATUS           307511 non-null  object 
 15  NAME_HOUSING_TYPE            307511 non-null  object 
 16  REGION_POPULATION_RELATIVE   307511 non-null  float64
 17  DAYS_BIRTH                   307511 non-null  int64  
 18  DAYS_EMPLOYED                307511 non-null  int64  
 19  DAYS_REGISTRATION            307511 non-null  int64  
 20  DAYS_ID_PUBLISH              307511 non-null  int64  
 21  FLAG_MOBIL                   307511 non-null  int64  
 22  FLAG_EMP_PHONE               307511 non-null  int64  
 23  FLAG_WORK_PHONE              307511 non-null  int64  
 24  FLAG_CONT_MOBILE             307511 non-null  int64  
 25  FLAG_PHONE                   307511 non-null  int64  
 26  FLAG_EMAIL                   307511 non-null  int64  
 27  OCCUPATION_TYPE              307511 non-null  object 
 28  CNT_FAM_MEMBERS              307511 non-null  int64  
 29  REGION_RATING_CLIENT         307511 non-null  int64  
 30  REGION_RATING_CLIENT_W_CITY  307511 non-null  int64  
 31  WEEKDAY_APPR_PROCESS_START   307511 non-null  object 
 32  HOUR_APPR_PROCESS_START      307511 non-null  int64  
 33  REG_REGION_NOT_LIVE_REGION   307511 non-null  int64  
 34  REG_REGION_NOT_WORK_REGION   307511 non-null  int64  
 35  LIVE_REGION_NOT_WORK_REGION  307511 non-null  int64  
 36  REG_CITY_NOT_LIVE_CITY       307511 non-null  int64  
 37  REG_CITY_NOT_WORK_CITY       307511 non-null  int64  
 38  LIVE_CITY_NOT_WORK_CITY      307511 non-null  int64  
 39  ORGANIZATION_TYPE            307511 non-null  object 
 40  EXT_SOURCE_2                 307511 non-null  float64
 41  EXT_SOURCE_3                 307511 non-null  float64
 42  DAYS_LAST_PHONE_CHANGE       307511 non-null  int64  
dtypes: float64(7), int64(24), object(12)
memory usage: 100.9+ MB

CHECKING VALUES OF OTHER CATEGORICAL COLUMNS

CHECKING GENDER CODE COLUMN

In [52]:
# VALUE COUNTS OF GENDER CODE

application_data_up.CODE_GENDER.value_counts()
Out[52]:
F      202448
M      105059
XNA         4
Name: CODE_GENDER, dtype: int64
In [53]:
#CHECKING GENDER COLUMN

plt.figure(figsize=[20,10])
sns.barplot(x=application_data_up.CODE_GENDER, y=application_data_up.TARGET).set_title("GENDER VS TARGET",fontsize=30, 
                                                                                       color='Blue', pad=25)
plt.show()
In [54]:
round(application_data_up.ORGANIZATION_TYPE.value_counts()/application_data_up.shape[0]*100,2)
Out[54]:
Business Entity Type 3    22.11
XNA                       18.01
Self-employed             12.49
Other                      5.43
Medicine                   3.64
Business Entity Type 2     3.43
Government                 3.38
School                     2.89
Trade: type 7              2.55
Kindergarten               2.24
Construction               2.19
Business Entity Type 1     1.95
Transport: type 4          1.76
Trade: type 3              1.14
Industry: type 9           1.10
Industry: type 3           1.07
Security                   1.06
Housing                    0.96
Industry: type 11          0.88
Military                   0.86
Bank                       0.82
Agriculture                0.80
Police                     0.76
Transport: type 2          0.72
Postal                     0.70
Security Ministries        0.64
Trade: type 2              0.62
Restaurant                 0.59
Services                   0.51
University                 0.43
Industry: type 7           0.43
Transport: type 3          0.39
Industry: type 1           0.34
Hotel                      0.31
Electricity                0.31
Industry: type 4           0.29
Trade: type 6              0.21
Industry: type 5           0.19
Insurance                  0.19
Telecom                    0.19
Emergency                  0.18
Industry: type 2           0.15
Advertising                0.14
Realtor                    0.13
Culture                    0.12
Industry: type 12          0.12
Trade: type 1              0.11
Mobile                     0.10
Legal Services             0.10
Cleaning                   0.08
Transport: type 1          0.07
Industry: type 6           0.04
Industry: type 10          0.04
Religion                   0.03
Industry: type 13          0.02
Trade: type 4              0.02
Trade: type 5              0.02
Industry: type 8           0.01
Name: ORGANIZATION_TYPE, dtype: float64
In [55]:
#Therefore, we eliminate the sub-category with the overall category

application_data_up.ORGANIZATION_TYPE = application_data_up.ORGANIZATION_TYPE.apply(lambda x: 'Industry' if 'Industry' in x else x)
application_data_up.ORGANIZATION_TYPE = application_data_up.ORGANIZATION_TYPE.apply(lambda x: 'Trade' if 'Trade' in x else x)
application_data_up.ORGANIZATION_TYPE = application_data_up.ORGANIZATION_TYPE.apply(lambda x: 'Transport' if 'Transport' in x else x)
application_data_up.ORGANIZATION_TYPE = application_data_up.ORGANIZATION_TYPE.apply(lambda x: 'Business' if 'Business' in x else x)
In [56]:
round(application_data_up.ORGANIZATION_TYPE.value_counts()/application_data_up.shape[0]*100,2)
Out[56]:
Business               27.49
XNA                    18.01
Self-employed          12.49
Other                   5.43
Trade                   4.66
Industry                4.65
Medicine                3.64
Government              3.38
Transport               2.92
School                  2.89
Kindergarten            2.24
Construction            2.19
Security                1.06
Housing                 0.96
Military                0.86
Bank                    0.82
Agriculture             0.80
Police                  0.76
Postal                  0.70
Security Ministries     0.64
Restaurant              0.59
Services                0.51
University              0.43
Hotel                   0.31
Electricity             0.31
Insurance               0.19
Telecom                 0.19
Emergency               0.18
Advertising             0.14
Realtor                 0.13
Culture                 0.12
Mobile                  0.10
Legal Services          0.10
Cleaning                0.08
Religion                0.03
Name: ORGANIZATION_TYPE, dtype: float64
In [57]:
plt.figure(figsize=[30,10])

sns.barplot(x=application_data_up.ORGANIZATION_TYPE.value_counts().index,
            y=application_data_up.ORGANIZATION_TYPE.value_counts().values).set_title("Distribution within ORGANIZATION_TYPE",
                                                                            fontsize=20,color='Blue',pad=20)
    
plt.xlabel('ORGANIZATION_TYPE',fontsize=18,color='Black')
plt.xticks(rotation = 90)

plt.show()
In [58]:
application_data_up.NAME_CONTRACT_TYPE.value_counts()
Out[58]:
Cash loans         278232
Revolving loans     29279
Name: NAME_CONTRACT_TYPE, dtype: int64
In [59]:
application_data_up.FLAG_OWN_CAR.value_counts()
Out[59]:
N    202924
Y    104587
Name: FLAG_OWN_CAR, dtype: int64
In [60]:
application_data_up.FLAG_OWN_REALTY.value_counts()
Out[60]:
Y    213312
N     94199
Name: FLAG_OWN_REALTY, dtype: int64
In [61]:
application_data_up.NAME_TYPE_SUITE.value_counts()
Out[61]:
Unaccompanied      249818
Family              40149
Spouse, partner     11370
Children             3267
Other_B              1770
Other_A               866
Group of people       271
Name: NAME_TYPE_SUITE, dtype: int64
In [62]:
application_data_up.NAME_EDUCATION_TYPE.value_counts()
Out[62]:
Secondary / secondary special    218391
Higher education                  74863
Incomplete higher                 10277
Lower secondary                    3816
Academic degree                     164
Name: NAME_EDUCATION_TYPE, dtype: int64
In [63]:
application_data_up.NAME_INCOME_TYPE.value_counts()
Out[63]:
Working                 158774
Commercial associate     71617
Pensioner                55362
State servant            21703
Unemployed                  22
Student                     18
Businessman                 10
Maternity leave              5
Name: NAME_INCOME_TYPE, dtype: int64
In [64]:
application_data_up.NAME_FAMILY_STATUS.value_counts()
Out[64]:
Married                 196432
Single / not married     45444
Civil marriage           29775
Separated                19770
Widow                    16088
Unknown                      2
Name: NAME_FAMILY_STATUS, dtype: int64
In [65]:
application_data_up.NAME_HOUSING_TYPE.value_counts()
Out[65]:
House / apartment      272868
With parents            14840
Municipal apartment     11183
Rented apartment         4881
Office apartment         2617
Co-op apartment          1122
Name: NAME_HOUSING_TYPE, dtype: int64
In [66]:
application_data_up.WEEKDAY_APPR_PROCESS_START.value_counts()
Out[66]:
TUESDAY      53901
WEDNESDAY    51934
MONDAY       50714
THURSDAY     50591
FRIDAY       50338
SATURDAY     33852
SUNDAY       16181
Name: WEEKDAY_APPR_PROCESS_START, dtype: int64
In [67]:
application_data_up.DAYS_BIRTH.unique()
Out[67]:
array([ -9461, -16765, -19046, ...,  -7951,  -7857, -25061], dtype=int64)
In [68]:
application_data_up.DAYS_EMPLOYED.unique()
Out[68]:
array([  -637,  -1188,   -225, ..., -12971, -11084,  -8694], dtype=int64)
In [69]:
#CHECKING 'DAYS_REGISTRATION' COLUMN

application_data_up.DAYS_REGISTRATION.unique()
Out[69]:
array([ -3648,  -1186,  -4260, ..., -16396, -14558, -14798], dtype=int64)
In [70]:
#CHECKING 'DAYS_ID_PUBLISH' COLUMN

application_data_up.DAYS_ID_PUBLISH.unique()
Out[70]:
array([-2120,  -291, -2531, ..., -6194, -5854, -6211], dtype=int64)
In [71]:
#CHECKING 'DAYS_LAST_PHONE_CHANGE' COLUMN

application_data_up.DAYS_LAST_PHONE_CHANGE.unique()
Out[71]:
array([-1134,  -828,  -815, ..., -3988, -3899, -3538], dtype=int64)
In [72]:
(application_data_up["DAYS_EMPLOYED"]//365).value_counts().tail()
Out[72]:
-49    4
-47    4
 0     2
-48    1
-50    1
Name: DAYS_EMPLOYED, dtype: int64
In [73]:
#changing values to positive integer

num_days=['DAYS_BIRTH','DAYS_EMPLOYED','DAYS_REGISTRATION','DAYS_ID_PUBLISH','DAYS_LAST_PHONE_CHANGE']

for i in num_days:
    application_data_up[i]=abs(application_data_up[i])
In [74]:
application_data_up.DAYS_LAST_PHONE_CHANGE.unique()
Out[74]:
array([1134,  828,  815, ..., 3988, 3899, 3538], dtype=int64)
In [75]:
application_data_up.AMT_INCOME_TOTAL.quantile([0,0.1,0.15,0.2,0.25,0.3,0.4,0.5,0.6,0.7,0.75,0.8,0.9,0.95,0.99,0.999,1])
Out[75]:
0.000        25650.0
0.100        81000.0
0.150        90000.0
0.200        99000.0
0.250       112500.0
0.300       112500.0
0.400       135000.0
0.500       147150.0
0.600       162000.0
0.700       180000.0
0.750       202500.0
0.800       225000.0
0.900       270000.0
0.950       337500.0
0.990       472500.0
0.999       900000.0
1.000    117000000.0
Name: AMT_INCOME_TOTAL, dtype: float64
In [76]:
application_data_up['INCOME_SLAB']=pd.qcut(application_data_up['AMT_INCOME_TOTAL'], q=[0,0.2,0.5,0.75,0.95,1],
                                          labels=['VeryLow','Low','Medium','High','VeryHigh'])
In [77]:
application_data_up['INCOME_SLAB'].value_counts()
Out[77]:
Low         90089
Medium      82213
VeryLow     63671
High        57503
VeryHigh    14035
Name: INCOME_SLAB, dtype: int64
In [78]:
application_data_up['AMT_CREDIT_slab']=pd.qcut(application_data_up['AMT_CREDIT'], q=[0,0.2,0.5,0.75,0.95,1],
                                          labels=['VeryLow','Low','Medium','High','VeryHigh'])
In [79]:
application_data_up['AMT_CREDIT_slab'].value_counts()
Out[79]:
Low         88924
Medium      77786
VeryLow     64925
High        61842
VeryHigh    14034
Name: AMT_CREDIT_slab, dtype: int64
In [80]:
#BINNING OF DAYS_BIRTH / AGE COLUMN
#CONVERTING DAYS TO YEARS

application_data_up['AGE']=(application_data_up['DAYS_BIRTH']//365).astype('int64',errors='ignore')
In [81]:
#MAX & MIN VALUES

print(application_data_up['AGE'].max())
print(application_data_up['AGE'].min())
69
20
In [82]:
#CREATING 10 BINS

application_data_up['AGE_BINS']=pd.cut(application_data_up['AGE'],bins=np.arange(20,71,5))
In [83]:
application_data_up['AGE_BINS'].value_counts()
Out[83]:
(35, 40]    43680
(40, 45]    39997
(30, 35]    39437
(25, 30]    36488
(50, 55]    35097
(45, 50]    34404
(55, 60]    32722
(60, 65]    24359
(20, 25]    16317
(65, 70]     5009
Name: AGE_BINS, dtype: int64

OUTLIER ANALYSIS

In [84]:
#CHECKING AMT_INCOME_TOTAL_COLUMN
application_data_up.AMT_INCOME_TOTAL.quantile([0.9991,0.9992,0.9993])
Out[84]:
0.9991    9.025105e+05
0.9992    1.034820e+06
0.9993    1.125000e+06
Name: AMT_INCOME_TOTAL, dtype: float64
In [85]:
#Handling outliers for the AMT_INCOME_TOTAL by capping the income to 10Lakh: - Coping with threshold value

application_data_up['AMT_INCOME_TOTAL']= np.where(application_data_up['AMT_INCOME_TOTAL']
                                                  >1000000,1000000,application_data_up['AMT_INCOME_TOTAL'])
In [86]:
application_data_up['AMT_INCOME_TOTAL'].value_counts()
Out[86]:
135000.0    35750
112500.0    31019
157500.0    26556
180000.0    24719
90000.0     22483
            ...  
117324.0        1
64584.0         1
142897.5        1
109170.0        1
113062.5        1
Name: AMT_INCOME_TOTAL, Length: 2499, dtype: int64
In [87]:
application_data_up['AMT_INCOME_TOTAL'].max()
Out[87]:
1000000.0
In [88]:
plt.figure(figsize=[20,5])

plt.subplot(1,2,1)
sns.boxplot(application_data_up['AMT_INCOME_TOTAL']).set_title("AMT_INCOME_TOTAL - BOX PLOT",fontsize=30,color='blue',pad=30)

plt.subplot(1,2,2)
sns.distplot(application_data_up['AMT_INCOME_TOTAL'],color='blue').set_title("AMT_INCOME_TOTAL - DISTRIBUTION",
                                                                     fontsize=30,color='blue',pad=30)

plt.show()
In [89]:
application_data_up.shape
Out[89]:
(307511, 47)

application_data_up.drop('AMT-INCOME_TOTAL',axis=1,inplace=True)

In [90]:
application_data_up.shape
Out[90]:
(307511, 47)
In [91]:
Q3=np.quantile(application_data_up['AMT_INCOME_TOTAL'],0.75)
Q1=np.quantile(application_data_up['AMT_INCOME_TOTAL'],0.25)
IQR= Q3-Q1
lower_range=Q1-1.5*IQR
upper_range=Q3+1.5*IQR

#outliers=round(application_data_up.loc[(application_data_up['AMT_INCOME_TOTAL']> upper_range)].count()/new_credit.shape[0]*100)
#outliers
application_data_up.loc[(application_data_up['AMT_INCOME_TOTAL']>upper_range)].shape
Out[91]:
(14035, 47)
In [92]:
def remov_outliers(data,col):
    lower_range=0
    upper_range=0
    Q3=np.quantile(data[col],0.75)
    Q1=np.quantile(data[col],0.25)
    IQR=Q3-Q1
    lower_range=Q1-1.5*IQR
    upper_range=Q3-1.5*IQR
    return upper_range,lower_range
In [93]:
#impute outliers with median
application_data_up['AMT_INCOME_TOTAL']=np.where(application_data_up['AMT_INCOME_TOTAL']
                >upper_range,application_data_up['AMT_INCOME_TOTAL'].median(),application_data_up['AMT_INCOME_TOTAL'])
In [94]:
application_data_up.shape
Out[94]:
(307511, 47)
In [95]:
sns.boxplot(application_data_up['AMT_INCOME_TOTAL']).set_title("AMT_INCOME_TOTAL - BOX PLOT", fontsize=25, color='blue', pad=20)
plt.show()
In [96]:
application_data_up.shape
Out[96]:
(307511, 47)
In [97]:
plt.figure(figsize=[30,10])

plt.subplot(1,2,1)
sns.boxplot(application_data_up['AMT_INCOME_TOTAL']).set_title("AMT_INCOME_TOTAL - BOX PLOT", fontsize=25, color='blue',pad=30)

plt.subplot(1,2,2)
sns.distplot(application_data_up['AMT_INCOME_TOTAL'],color='blue').set_title("AMT_INCOME_TOTAL - DISTRIBUTION",
                                                                            fontsize=25, color='blue', pad=25)
plt.show()
In [98]:
plt.figure(figsize=[20,5])

plt.subplot(1,2,1)
sns.boxplot(application_data_up['AMT_ANNUITY']).set_title("AMT_ANNUITY - BOX PLOT",fontsize=25,color='blue',pad=20)

plt.subplot(1,2,2)
sns.distplot(application_data_up['AMT_ANNUITY'],color='blue').set_title("AMT_ANNUITY - DISTRIBUTION",
                                                                     fontsize=25,color='indigo',pad=20)

plt.show()
In [99]:
application_data_up.shape
Out[99]:
(307511, 47)
In [100]:
plt.figure(figsize=[30,10])

plt.subplot(1,2,1)
sns.boxplot(application_data_up['AMT_ANNUITY']).set_title("AMT_ANNUITY - BOX PLOT",fontsize=15,color='blue',pad=20)

plt.subplot(1,2,2)
sns.distplot(application_data_up['AMT_ANNUITY'],color='blue').set_title("AMT_ANNUITY - DISTRIBUTION",
                                                                     fontsize=15,color='blue',pad=20)
plt.show()
In [101]:
plt.figure(figsize=[30,10])

plt.subplot(1,2,1)
sns.boxplot(application_data_up['AGE']).set_title("AGE - BOXPLOT",fontsize=25,color='blue',pad=30)

plt.subplot(1,2,2)
sns.distplot(application_data_up['AGE'],color='blue').set_title("AGE - DISTRIBUTION",
                                                                     fontsize=25,color='blue',pad=30)
plt.show()
In [102]:
plt.figure(figsize=[20,5])

plt.subplot(1,2,1)
sns.boxplot(application_data_up['DAYS_EMPLOYED']).set_title("DAYS_EMPLOYED - BOX PLOT",fontsize=25,color='blue',pad=30)

plt.subplot(1,2,2)
sns.distplot(application_data_up['DAYS_EMPLOYED'],color='blue').set_title("DAYS_EMPLOYED - DISTRIBUTION",
                                                                     fontsize=25,color='blue',pad=30)

plt.show()
In [103]:
application_data_up['DAYS_EMPLOYED']=np.where(application_data_up['DAYS_EMPLOYED']
                        >=100000,application_data_up['DAYS_EMPLOYED'].median(),application_data_up['DAYS_EMPLOYED'])
In [104]:
application_data_up['DAYS_EMPLOYED']=np.where(application_data_up['DAYS_EMPLOYED']//365>=80,application_data_up['DAYS_EMPLOYED'].median(),application_data_up['DAYS_EMPLOYED'])
In [105]:
application_data_up.shape
Out[105]:
(307511, 47)
In [106]:
plt.figure(figsize=[20,5])

plt.subplot(1,2,1)
sns.boxplot(application_data_up['DAYS_EMPLOYED']).set_title("DAYS_EMPLOYED - BOX PLOT",fontsize=25,color='blue',pad=30)

plt.subplot(1,2,2)
sns.distplot(application_data_up['DAYS_EMPLOYED'],color='blue').set_title("DAYS_EMPLOYED - DISTRIBUTION",
                                                                     fontsize=25,color='blue',pad=30)

plt.show()
In [107]:
#CHECKING DAYS_REGISTRATION COLUMN
plt.figure(figsize=[20,5])

plt.subplot(1,2,1)
sns.boxplot(application_data_up['DAYS_REGISTRATION']).set_title("DAYS_REGISTRATION - BOX PLOT",fontsize=25,color='blue',pad=30)

plt.subplot(1,2,2)
sns.distplot(application_data_up['DAYS_REGISTRATION'],color='blue').set_title("DAYS_REGISTRATION - DISTRIBUTION",
                                                                     fontsize=25,color='blue',pad=30)

plt.show()
In [108]:
#CREATING ADDITIONAL COLUMNS FOR ANALYSIS
#CREATING A COLUMN WITH VALUES CREDIT TO INCOME RATIO - Derived Metrics

application_data_up['CREDIT_RATIO']=(application_data_up.AMT_CREDIT // application_data_up.AMT_INCOME_TOTAL).astype('int64')
In [109]:
application_data_up.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Data columns (total 48 columns):
 #   Column                       Non-Null Count   Dtype   
---  ------                       --------------   -----   
 0   SK_ID_CURR                   307511 non-null  int64   
 1   TARGET                       307511 non-null  int64   
 2   NAME_CONTRACT_TYPE           307511 non-null  object  
 3   CODE_GENDER                  307511 non-null  object  
 4   FLAG_OWN_CAR                 307511 non-null  object  
 5   FLAG_OWN_REALTY              307511 non-null  object  
 6   CNT_CHILDREN                 307511 non-null  int64   
 7   AMT_INCOME_TOTAL             307511 non-null  float64 
 8   AMT_CREDIT                   307511 non-null  float64 
 9   AMT_ANNUITY                  307511 non-null  float64 
 10  AMT_GOODS_PRICE              307511 non-null  float64 
 11  NAME_TYPE_SUITE              307511 non-null  object  
 12  NAME_INCOME_TYPE             307511 non-null  object  
 13  NAME_EDUCATION_TYPE          307511 non-null  object  
 14  NAME_FAMILY_STATUS           307511 non-null  object  
 15  NAME_HOUSING_TYPE            307511 non-null  object  
 16  REGION_POPULATION_RELATIVE   307511 non-null  float64 
 17  DAYS_BIRTH                   307511 non-null  int64   
 18  DAYS_EMPLOYED                307511 non-null  float64 
 19  DAYS_REGISTRATION            307511 non-null  int64   
 20  DAYS_ID_PUBLISH              307511 non-null  int64   
 21  FLAG_MOBIL                   307511 non-null  int64   
 22  FLAG_EMP_PHONE               307511 non-null  int64   
 23  FLAG_WORK_PHONE              307511 non-null  int64   
 24  FLAG_CONT_MOBILE             307511 non-null  int64   
 25  FLAG_PHONE                   307511 non-null  int64   
 26  FLAG_EMAIL                   307511 non-null  int64   
 27  OCCUPATION_TYPE              307511 non-null  object  
 28  CNT_FAM_MEMBERS              307511 non-null  int64   
 29  REGION_RATING_CLIENT         307511 non-null  int64   
 30  REGION_RATING_CLIENT_W_CITY  307511 non-null  int64   
 31  WEEKDAY_APPR_PROCESS_START   307511 non-null  object  
 32  HOUR_APPR_PROCESS_START      307511 non-null  int64   
 33  REG_REGION_NOT_LIVE_REGION   307511 non-null  int64   
 34  REG_REGION_NOT_WORK_REGION   307511 non-null  int64   
 35  LIVE_REGION_NOT_WORK_REGION  307511 non-null  int64   
 36  REG_CITY_NOT_LIVE_CITY       307511 non-null  int64   
 37  REG_CITY_NOT_WORK_CITY       307511 non-null  int64   
 38  LIVE_CITY_NOT_WORK_CITY      307511 non-null  int64   
 39  ORGANIZATION_TYPE            307511 non-null  object  
 40  EXT_SOURCE_2                 307511 non-null  float64 
 41  EXT_SOURCE_3                 307511 non-null  float64 
 42  DAYS_LAST_PHONE_CHANGE       307511 non-null  int64   
 43  INCOME_SLAB                  307511 non-null  category
 44  AMT_CREDIT_slab              307511 non-null  category
 45  AGE                          307511 non-null  int64   
 46  AGE_BINS                     307510 non-null  category
 47  CREDIT_RATIO                 307511 non-null  int64   
dtypes: category(3), float64(8), int64(25), object(12)
memory usage: 106.5+ MB
In [110]:
##CHECKING TARGET IMBALANCE

go.Figure(data=[go.Pie(labels=application_data_up.TARGET.value_counts(normalize=True).index,
                       values=application_data_up.TARGET.value_counts(normalize=True).values, hole=.5,
                      title='Defaulter Vs Non-Defaulters')])
In [111]:
##CHECKING GENDER IMBALANCE

go.Figure(data=[go.Pie(labels=application_data_up.CODE_GENDER.value_counts(normalize=True).index,
                       values=application_data_up.CODE_GENDER.value_counts(normalize=True).values, hole=.5,
                      title='Male Vs Female')])
In [112]:
plt.figure(figsize=[12,12])

f=sns.heatmap(application_data_up[['AMT_CREDIT','AMT_GOODS_PRICE','AMT_ANNUITY','AMT_INCOME_TOTAL','AGE','DAYS_BIRTH',
                          'DAYS_EMPLOYED','REGION_RATING_CLIENT','REGION_RATING_CLIENT_W_CITY']].corr(),
              cmap="Blues",annot=True).set_title('Correlation between variables',fontsize=20,color='blue')

plt.show()
In [113]:
plt.figure(figsize=[12,12])

f=sns.heatmap(application_data_up[['AGE','FLAG_EMP_PHONE','DAYS_REGISTRATION','DAYS_ID_PUBLISH','REGION_RATING_CLIENT',
                          'REGION_RATING_CLIENT_W_CITY','REG_CITY_NOT_LIVE_CITY','REG_CITY_NOT_WORK_CITY',
                           'LIVE_CITY_NOT_WORK_CITY']].corr(),
              cmap="Oranges",annot=True).set_title('Correlation between variables',fontsize=15,color='NAvy')

plt.show()
In [114]:
application_data_up['REG_CITY_NOT_LIVE_CITY'].value_counts()
Out[114]:
0    283472
1     24039
Name: REG_CITY_NOT_LIVE_CITY, dtype: int64
In [115]:
application_data_up['AGE_BINS'].value_counts()
Out[115]:
(35, 40]    43680
(40, 45]    39997
(30, 35]    39437
(25, 30]    36488
(50, 55]    35097
(45, 50]    34404
(55, 60]    32722
(60, 65]    24359
(20, 25]    16317
(65, 70]     5009
Name: AGE_BINS, dtype: int64
In [116]:
#Creating two datasets for target=1 and target=0 (1=bad,0=good)

target1=application_data_up[application_data_up['TARGET']==1]
target0=application_data_up[application_data_up['TARGET']==0]

print(target1.shape,target0.shape,application_data_up.shape)
(24825, 48) (282686, 48) (307511, 48)
In [117]:
print("The dataset with Target value 1 has :" + "{:.2%}".format(target1.shape[0]/application_data_up.shape[0])+ "data.")
print("The dataset with Target value 0 has :" + "{:.2%}".format(target0.shape[0]/application_data_up.shape[0])+ "data.")
The dataset with Target value 1 has :8.07%data.
The dataset with Target value 0 has :91.93%data.
In [118]:
#CHECKING THE GENDER COLUMNS

plt.figure(figsize=[15,5])

plt.subplot(1,2,1)
sns.barplot(x=target1.CODE_GENDER.value_counts(normalize=True).index,
          y=target1.CODE_GENDER.value_counts(normalize=True).values, 
           palette="rocket").set_title("Gender(Defulter)%" , fontsize=30, color="blue")

plt.subplot(1,2,2)
sns.barplot(x=target0.CODE_GENDER.value_counts(normalize=True).index,
           y=target0.CODE_GENDER.value_counts(normalize=True).values,
           palette="rocket").set_title("Gender(Non-Defulter)%" , fontsize=30, color="blue")
plt.show()
In [119]:
#checking NAME_CONTACT_TYPE

plt.figure(figsize=[15,5])

plt.subplot(1,2,1)
target1.NAME_CONTRACT_TYPE.value_counts(normalize=True).plot.barh(color="blue").set_title("NAME_CONTRACT_TYPE(DEFAULTER)%",
                                                                                       fontsize=20,color="black", pad=30)
plt.subplot(1,2,2)
target0.NAME_CONTRACT_TYPE.value_counts(normalize=True).plot.barh(color="red").set_title("NAME_CONTRACT_TYPE(NON-DEFAULTER)%",
                                                                                       fontsize=20,color="black", pad=30)
plt.show()
In [120]:
#Owing car vs Defaulting

plt.figure(figsize=[15,5])

plt.subplot(1,2,1)
target1.FLAG_OWN_CAR.value_counts(normalize=True).plot.barh(color="blue").set_title("FLAG_OWN_CAR(DEFAULTER)%",
                                                                                       fontsize=20,color="black", pad=30)
plt.subplot(1,2,2)
target0.FLAG_OWN_CAR.value_counts(normalize=True).plot.barh(color="red").set_title("FLAG_OWN_CAR(NON-DEFAULTER)%",
                                                                                       fontsize=20,color="black", pad=30)
plt.show()
In [121]:
#Owing a House/Estate etc vs Defaulting

plt.figure(figsize=[15,5])

plt.subplot(1,2,1)
target1.FLAG_OWN_REALTY.value_counts(normalize=True).plot.barh(color="blue").set_title("FLAG_OWN_REALTY(DEFAULTER)%",
                                                                                       fontsize=20,color="black", pad=30)
plt.subplot(1,2,2)
target0.FLAG_OWN_REALTY.value_counts(normalize=True).plot.barh(color="red").set_title("FLAG_OWN_REALTY(NON-DEFAULTER)%",
                                                                                       fontsize=20,color="black", pad=30)
plt.show()
In [122]:
#NAME_TYPE_SUITE vs Defaulting

plt.figure(figsize=[15,5])

plt.subplot(1,2,1)
target1.NAME_TYPE_SUITE.value_counts(normalize=True).plot.barh(color="blue").set_title("NAME_TYPE_SUITE(DEFAULTER)%",
                                                                                       fontsize=20,color="black", pad=30)
plt.subplot(1,2,2)
target0.NAME_TYPE_SUITE.value_counts(normalize=True).plot.barh(color="red").set_title("NAME_TYPE_SUITE(NON-DEFAULTER)%",
                                                                                       fontsize=20,color="black", pad=30)
plt.show()
In [123]:
#NAME_INCOME_TYPE vs Defaulting

plt.figure(figsize=[15,5])

plt.subplot(1,2,1)
target1.NAME_INCOME_TYPE.value_counts(normalize=True).plot.barh(color="blue").set_title("NAME_INCOME_TYPE(DEFAULTER)%",
                                                                                       fontsize=20,color="black", pad=30)
plt.subplot(1,2,2)
target0.NAME_INCOME_TYPE.value_counts(normalize=True).plot.barh(color="red").set_title("NAME_INCOME_TYPE(NON-DEFAULTER)%",
                                                                                       fontsize=20,color="black", pad=30)
plt.show()
In [124]:
#Education vs Defaulting

plt.figure(figsize=[15,5])

plt.subplot(1,2,1)
target1.NAME_EDUCATION_TYPE.value_counts(normalize=True).plot.barh(color="blue").set_title("NAME_EDUCATION_TYPE(DEFAULTER)%",
                                                                                       fontsize=20,color="black", pad=30)
plt.subplot(1,2,2)
target0.NAME_EDUCATION_TYPE.value_counts(normalize=True).plot.barh(color="red").set_title("NAME_EDUCATION_TYPE(NON-DEFAULTER)%",
                                                                                       fontsize=20,color="black", pad=30)
plt.show()
In [125]:
#Housing vs Defaulting

plt.figure(figsize=[15,5])

plt.subplot(1,2,1)
target1.NAME_HOUSING_TYPE.value_counts(normalize=True).plot.barh(color="blue").set_title("NAME_HOUSING_TYPE(DEFAULTER)%",
                                                                                       fontsize=20,color="black", pad=30)
plt.subplot(1,2,2)
target0.NAME_HOUSING_TYPE.value_counts(normalize=True).plot.barh(color="red").set_title("NAME_HOUSING_TYPE(NON-DEFAULTER)%",
                                                                                       fontsize=20,color="black", pad=30)
plt.show()
In [126]:
#Marital status vs Defaulting

plt.figure(figsize=[15,5])

plt.subplot(1,2,1)
target1.NAME_FAMILY_STATUS.value_counts(normalize=True).plot.barh(color="blue").set_title("NAME_FAMILY_STATUS(DEFAULTER)%",
                                                                                       fontsize=20,color="black", pad=30)
plt.subplot(1,2,2)
target0.NAME_FAMILY_STATUS.value_counts(normalize=True).plot.barh(color="red").set_title("NAME_FAMILY_STATUS(NON-DEFAULTER)%",
                                                                                       fontsize=20,color="black", pad=30)
plt.show()
In [127]:
#Occupations vs Defaulting

plt.figure(figsize=[15,5])

plt.subplot(1,2,1)
target1.OCCUPATION_TYPE.value_counts(normalize=True).plot.barh(color="blue").set_title("OCCUPATION_TYPE(DEFAULTER)%",
                                                                                       fontsize=20,color="black", pad=30)
plt.subplot(1,2,2)
target0.OCCUPATION_TYPE.value_counts(normalize=True).plot.barh(color="red").set_title("OCCUPATION_TYPE(NON-DEFAULTER)%",
                                                                                       fontsize=20,color="black", pad=30)
plt.show()
In [128]:
#Day of the week vs Defaulting

plt.figure(figsize=[15,5])

plt.subplot(1,2,1)
target1.WEEKDAY_APPR_PROCESS_START.value_counts(normalize=True).plot.barh(color="blue").set_title("WEEKDAY_APPR_PROCESS_START(DEFAULTER)%",
                                                                                       fontsize=10,color="black", pad=30)
plt.subplot(1,2,2)
target0.WEEKDAY_APPR_PROCESS_START.value_counts(normalize=True).plot.barh(color="red").set_title("WEEKDAY_APPR_PROCESS_START(NON-DEFAULTER)%",
                                                                                       fontsize=10,color="black", pad=30)
plt.show()
In [129]:
#Organization type vs Defaulting

plt.figure(figsize=[15,5])

plt.subplot(1,2,1)
target1.ORGANIZATION_TYPE.value_counts(normalize=True).plot.barh(color="blue").set_title("ORGANIZATION_TYPE(DEFAULTER)%",
                                                                                       fontsize=10,color="black", pad=30)
plt.subplot(1,2,2)
target0.ORGANIZATION_TYPE.value_counts(normalize=True).plot.barh(color="red").set_title("ORGANIZATION_TYPE(NON-DEFAULTER)%",
                                                                                       fontsize=10,color="black", pad=30)
plt.show()
In [130]:
#Income vs Defaulting

plt.figure(figsize=[15,5])

plt.subplot(1,2,1)
target1.INCOME_SLAB.value_counts(normalize=True).plot.barh(color="blue").set_title("INCOME_SLAB(DEFAULTER)%",
                                                                                       fontsize=10,color="black", pad=30)
plt.subplot(1,2,2)
target0.INCOME_SLAB.value_counts(normalize=True).plot.barh(color="red").set_title("INCOME_SLAB(NON-DEFAULTER)%",
                                                                                       fontsize=10,color="black", pad=30)
plt.show()
In [131]:
#Loan amount vs Defaulting

plt.figure(figsize=[15,5])

plt.subplot(1,2,1)
target1.AMT_CREDIT_slab.value_counts(normalize=True).plot.barh(color="blue").set_title("AMT_CREDIT_slab(DEFAULTER)%",
                                                                                       fontsize=10,color="black", pad=30)
plt.subplot(1,2,2)
target0.AMT_CREDIT_slab.value_counts(normalize=True).plot.barh(color="red").set_title("AMT_CREDIT_slab(NON-DEFAULTER)%",
                                                                                       fontsize=10,color="black", pad=30)
plt.show()
In [132]:
#Age groups vs Defaulting

plt.figure(figsize=[15,5])

plt.subplot(1,2,1)
target1.AGE_BINS.value_counts(normalize=True).plot.barh(color="blue").set_title("AGE_BINS(DEFAULTER)%",
                                                                                       fontsize=10,color="black", pad=30)
plt.subplot(1,2,2)
target0.AGE_BINS.value_counts(normalize=True).plot.barh(color="red").set_title("AGE_BINS(NON-DEFAULTER)%",
                                                                                       fontsize=10,color="black", pad=30)
plt.show()
In [133]:
target0.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 282686 entries, 1 to 307510
Data columns (total 48 columns):
 #   Column                       Non-Null Count   Dtype   
---  ------                       --------------   -----   
 0   SK_ID_CURR                   282686 non-null  int64   
 1   TARGET                       282686 non-null  int64   
 2   NAME_CONTRACT_TYPE           282686 non-null  object  
 3   CODE_GENDER                  282686 non-null  object  
 4   FLAG_OWN_CAR                 282686 non-null  object  
 5   FLAG_OWN_REALTY              282686 non-null  object  
 6   CNT_CHILDREN                 282686 non-null  int64   
 7   AMT_INCOME_TOTAL             282686 non-null  float64 
 8   AMT_CREDIT                   282686 non-null  float64 
 9   AMT_ANNUITY                  282686 non-null  float64 
 10  AMT_GOODS_PRICE              282686 non-null  float64 
 11  NAME_TYPE_SUITE              282686 non-null  object  
 12  NAME_INCOME_TYPE             282686 non-null  object  
 13  NAME_EDUCATION_TYPE          282686 non-null  object  
 14  NAME_FAMILY_STATUS           282686 non-null  object  
 15  NAME_HOUSING_TYPE            282686 non-null  object  
 16  REGION_POPULATION_RELATIVE   282686 non-null  float64 
 17  DAYS_BIRTH                   282686 non-null  int64   
 18  DAYS_EMPLOYED                282686 non-null  float64 
 19  DAYS_REGISTRATION            282686 non-null  int64   
 20  DAYS_ID_PUBLISH              282686 non-null  int64   
 21  FLAG_MOBIL                   282686 non-null  int64   
 22  FLAG_EMP_PHONE               282686 non-null  int64   
 23  FLAG_WORK_PHONE              282686 non-null  int64   
 24  FLAG_CONT_MOBILE             282686 non-null  int64   
 25  FLAG_PHONE                   282686 non-null  int64   
 26  FLAG_EMAIL                   282686 non-null  int64   
 27  OCCUPATION_TYPE              282686 non-null  object  
 28  CNT_FAM_MEMBERS              282686 non-null  int64   
 29  REGION_RATING_CLIENT         282686 non-null  int64   
 30  REGION_RATING_CLIENT_W_CITY  282686 non-null  int64   
 31  WEEKDAY_APPR_PROCESS_START   282686 non-null  object  
 32  HOUR_APPR_PROCESS_START      282686 non-null  int64   
 33  REG_REGION_NOT_LIVE_REGION   282686 non-null  int64   
 34  REG_REGION_NOT_WORK_REGION   282686 non-null  int64   
 35  LIVE_REGION_NOT_WORK_REGION  282686 non-null  int64   
 36  REG_CITY_NOT_LIVE_CITY       282686 non-null  int64   
 37  REG_CITY_NOT_WORK_CITY       282686 non-null  int64   
 38  LIVE_CITY_NOT_WORK_CITY      282686 non-null  int64   
 39  ORGANIZATION_TYPE            282686 non-null  object  
 40  EXT_SOURCE_2                 282686 non-null  float64 
 41  EXT_SOURCE_3                 282686 non-null  float64 
 42  DAYS_LAST_PHONE_CHANGE       282686 non-null  int64   
 43  INCOME_SLAB                  282686 non-null  category
 44  AMT_CREDIT_slab              282686 non-null  category
 45  AGE                          282686 non-null  int64   
 46  AGE_BINS                     282685 non-null  category
 47  CREDIT_RATIO                 282686 non-null  int64   
dtypes: category(3), float64(8), int64(25), object(12)
memory usage: 100.0+ MB
In [134]:
target0.shape
Out[134]:
(282686, 48)

WORLING WITH PREVIOUS APPLICATION

In [135]:
previous_application.head()
Out[135]:
SK_ID_PREV SK_ID_CURR NAME_CONTRACT_TYPE AMT_ANNUITY AMT_APPLICATION AMT_CREDIT AMT_DOWN_PAYMENT AMT_GOODS_PRICE WEEKDAY_APPR_PROCESS_START HOUR_APPR_PROCESS_START FLAG_LAST_APPL_PER_CONTRACT NFLAG_LAST_APPL_IN_DAY RATE_DOWN_PAYMENT RATE_INTEREST_PRIMARY RATE_INTEREST_PRIVILEGED NAME_CASH_LOAN_PURPOSE NAME_CONTRACT_STATUS DAYS_DECISION NAME_PAYMENT_TYPE CODE_REJECT_REASON NAME_TYPE_SUITE NAME_CLIENT_TYPE NAME_GOODS_CATEGORY NAME_PORTFOLIO NAME_PRODUCT_TYPE CHANNEL_TYPE SELLERPLACE_AREA NAME_SELLER_INDUSTRY CNT_PAYMENT NAME_YIELD_GROUP PRODUCT_COMBINATION DAYS_FIRST_DRAWING DAYS_FIRST_DUE DAYS_LAST_DUE_1ST_VERSION DAYS_LAST_DUE DAYS_TERMINATION NFLAG_INSURED_ON_APPROVAL
0 2030495 271877 Consumer loans 1730.430 17145.0 17145.0 0.0 17145.0 SATURDAY 15 Y 1 0.0 0.182832 0.867336 XAP Approved -73 Cash through the bank XAP NaN Repeater Mobile POS XNA Country-wide 35 Connectivity 12.0 middle POS mobile with interest 365243.0 -42.0 300.0 -42.0 -37.0 0.0
1 2802425 108129 Cash loans 25188.615 607500.0 679671.0 NaN 607500.0 THURSDAY 11 Y 1 NaN NaN NaN XNA Approved -164 XNA XAP Unaccompanied Repeater XNA Cash x-sell Contact center -1 XNA 36.0 low_action Cash X-Sell: low 365243.0 -134.0 916.0 365243.0 365243.0 1.0
2 2523466 122040 Cash loans 15060.735 112500.0 136444.5 NaN 112500.0 TUESDAY 11 Y 1 NaN NaN NaN XNA Approved -301 Cash through the bank XAP Spouse, partner Repeater XNA Cash x-sell Credit and cash offices -1 XNA 12.0 high Cash X-Sell: high 365243.0 -271.0 59.0 365243.0 365243.0 1.0
3 2819243 176158 Cash loans 47041.335 450000.0 470790.0 NaN 450000.0 MONDAY 7 Y 1 NaN NaN NaN XNA Approved -512 Cash through the bank XAP NaN Repeater XNA Cash x-sell Credit and cash offices -1 XNA 12.0 middle Cash X-Sell: middle 365243.0 -482.0 -152.0 -182.0 -177.0 1.0
4 1784265 202054 Cash loans 31924.395 337500.0 404055.0 NaN 337500.0 THURSDAY 9 Y 1 NaN NaN NaN Repairs Refused -781 Cash through the bank HC NaN Repeater XNA Cash walk-in Credit and cash offices -1 XNA 24.0 high Cash Street: high NaN NaN NaN NaN NaN NaN
In [136]:
previous_application.shape
Out[136]:
(1670214, 37)
In [137]:
previous_application.isnull().sum()
Out[137]:
SK_ID_PREV                           0
SK_ID_CURR                           0
NAME_CONTRACT_TYPE                   0
AMT_ANNUITY                     372235
AMT_APPLICATION                      0
AMT_CREDIT                           1
AMT_DOWN_PAYMENT                895844
AMT_GOODS_PRICE                 385515
WEEKDAY_APPR_PROCESS_START           0
HOUR_APPR_PROCESS_START              0
FLAG_LAST_APPL_PER_CONTRACT          0
NFLAG_LAST_APPL_IN_DAY               0
RATE_DOWN_PAYMENT               895844
RATE_INTEREST_PRIMARY          1664263
RATE_INTEREST_PRIVILEGED       1664263
NAME_CASH_LOAN_PURPOSE               0
NAME_CONTRACT_STATUS                 0
DAYS_DECISION                        0
NAME_PAYMENT_TYPE                    0
CODE_REJECT_REASON                   0
NAME_TYPE_SUITE                 820405
NAME_CLIENT_TYPE                     0
NAME_GOODS_CATEGORY                  0
NAME_PORTFOLIO                       0
NAME_PRODUCT_TYPE                    0
CHANNEL_TYPE                         0
SELLERPLACE_AREA                     0
NAME_SELLER_INDUSTRY                 0
CNT_PAYMENT                     372230
NAME_YIELD_GROUP                     0
PRODUCT_COMBINATION                346
DAYS_FIRST_DRAWING              673065
DAYS_FIRST_DUE                  673065
DAYS_LAST_DUE_1ST_VERSION       673065
DAYS_LAST_DUE                   673065
DAYS_TERMINATION                673065
NFLAG_INSURED_ON_APPROVAL       673065
dtype: int64
In [138]:
req_columns=['SK_ID_CURR','AMT_APPLICATION','NAME_CASH_LOAN_PURPOSE','NAME_CONTRACT_STATUS','DAYS_DECISION','NAME_PAYMENT_TYPE',
            'CODE_REJECT_REASON','NAME_CLIENT_TYPE','NAME_GOODS_CATEGORY','NAME_PORTFOLIO','NAME_PRODUCT_TYPE','CHANNEL_TYPE',
            'NAME_YIELD_GROUP']
previous_application=previous_application.loc[:,req_columns]
In [139]:
previous_application.shape
Out[139]:
(1670214, 13)
In [140]:
#checking for any missing values

previous_application.isnull().sum()
Out[140]:
SK_ID_CURR                0
AMT_APPLICATION           0
NAME_CASH_LOAN_PURPOSE    0
NAME_CONTRACT_STATUS      0
DAYS_DECISION             0
NAME_PAYMENT_TYPE         0
CODE_REJECT_REASON        0
NAME_CLIENT_TYPE          0
NAME_GOODS_CATEGORY       0
NAME_PORTFOLIO            0
NAME_PRODUCT_TYPE         0
CHANNEL_TYPE              0
NAME_YIELD_GROUP          0
dtype: int64
In [141]:
previous_application.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1670214 entries, 0 to 1670213
Data columns (total 13 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   SK_ID_CURR              1670214 non-null  int64  
 1   AMT_APPLICATION         1670214 non-null  float64
 2   NAME_CASH_LOAN_PURPOSE  1670214 non-null  object 
 3   NAME_CONTRACT_STATUS    1670214 non-null  object 
 4   DAYS_DECISION           1670214 non-null  int64  
 5   NAME_PAYMENT_TYPE       1670214 non-null  object 
 6   CODE_REJECT_REASON      1670214 non-null  object 
 7   NAME_CLIENT_TYPE        1670214 non-null  object 
 8   NAME_GOODS_CATEGORY     1670214 non-null  object 
 9   NAME_PORTFOLIO          1670214 non-null  object 
 10  NAME_PRODUCT_TYPE       1670214 non-null  object 
 11  CHANNEL_TYPE            1670214 non-null  object 
 12  NAME_YIELD_GROUP        1670214 non-null  object 
dtypes: float64(1), int64(2), object(10)
memory usage: 165.7+ MB
In [142]:
previous_application.DAYS_DECISION.unique()
Out[142]:
array([  -73,  -164,  -301, ..., -1967, -2389,    -1], dtype=int64)
In [143]:
#changing values to positive integer

previous_application['DAYS_DECISION']=abs(previous_application['DAYS_DECISION'])
In [144]:
previous_application.NAME_PAYMENT_TYPE.value_counts()
Out[144]:
Cash through the bank                        1033552
XNA                                           627384
Non-cash from your account                      8193
Cashless from the account of the employer       1085
Name: NAME_PAYMENT_TYPE, dtype: int64
In [145]:
previous_application.NAME_CLIENT_TYPE.value_counts()
Out[145]:
Repeater     1231261
New           301363
Refreshed     135649
XNA             1941
Name: NAME_CLIENT_TYPE, dtype: int64
In [146]:
previous_application.NAME_PORTFOLIO.value_counts()
Out[146]:
POS      691011
Cash     461563
XNA      372230
Cards    144985
Cars        425
Name: NAME_PORTFOLIO, dtype: int64
In [147]:
#CHANGING XNA TO UNKNOWN

XNA_col=['NAME_PAYMENT_TYPE','NAME_CLIENT_TYPE','NAME_PORTFOLIO']

for i in XNA_col:
    previous_application[i]=previous_application[i].str.replace('XMA','unknown')

Univariate analysis ont the previous_application columns

In [148]:
#plotting graph for amt_application

plt.figure(figsize=[20,6])

plt.subplot(1,2,1)
sns.boxplot(previous_application['AMT_APPLICATION']).set_title("AMT_APPLICATION - BOXPLOT", fontsize=20, color='blue', pad=20)
plt.subplot(1,2,2)
sns.distplot(previous_application['AMT_APPLICATION'],color="blue").set_title("AMT_APPLICATION - DISTRIBUTION",
                                                                             fontsize=20, color='red', pad=20)

plt.show()
In [149]:
#plotting graph for DAYS_DECISION

plt.figure(figsize=[20,6])

plt.subplot(1,2,1)
sns.boxplot(previous_application['DAYS_DECISION']).set_title("DAYS_DECISION - BOX PLOT",fontsize=20,color='blue',pad=20)

plt.subplot(1,2,2)
sns.distplot(previous_application['DAYS_DECISION'],color='red').set_title("DAYS_DECISION - DISTRIBUTION",
                                                                     fontsize=20,color='blue',pad=20)

plt.show()
In [150]:
#reason for load
plt.figure(figsize=[20,10])

previous_application.NAME_CASH_LOAN_PURPOSE.value_counts(normalize=True).plot.barh(color='blue').set_title("NAME_CASH_LOAN_PURPOSE",
                                                                     fontsize=20,color='blue',pad=20)
plt.show()
In [151]:
#since most of the values are missing,this column is not useful for our analysis

previous_application.drop(columns=['NAME_CASH_LOAN_PURPOSE'],inplace=True)
In [152]:
previous_application.shape
Out[152]:
(1670214, 12)
In [153]:
#Payment method for loan

plt.figure(figsize=[20,10])

previous_application.NAME_PAYMENT_TYPE.value_counts(normalize=True).plot.barh(color="blue").set_title("Payment method for loan",
                                                                                       fontsize=15,color="black", pad=30)
plt.show()
In [154]:
#kind of goods the client applied for in the previous loan

plt.figure(figsize=[20,10])
previous_application.NAME_GOODS_CATEGORY.value_counts(normalize=True).plot.barh(color="blue").set_title("kind of goods the client applied for in the previous loan",
                                                                                       fontsize=15,color="black", pad=30)
plt.show()
In [155]:
previous_application.drop(columns=['NAME_GOODS_CATEGORY'],inplace=True)  #this
previous_application.shape
Out[155]:
(1670214, 11)
In [156]:
#was the prev app for cash,pos,car

plt.figure(figsize=[20,10])


previous_application.NAME_PORTFOLIO.value_counts(normalize=True).plot.barh(color="Brown").set_title("REASON FOR PREV APP",
                                                                                       fontsize=15,color="black", pad=20)
plt.show()
In [157]:
#Through which channel we acquired the client on the previous application

plt.figure(figsize=[20,10])
previous_application.CHANNEL_TYPE.value_counts(normalize=True).plot.barh(color="blue").set_title("Through which channel we acquired the client on the previous application",
                                                                                       fontsize=15,color="black", pad=30)
plt.show()
In [158]:
#since this column will not add value to our analysis, we are dropping it....

previous_application.drop(columns=['NAME_YIELD_GROUP'],inplace=True)
previous_application.shape
Out[158]:
(1670214, 10)

Merging both new and old dataframes

In [159]:
merged_df=pd.merge(left=application_data_up,right=previous_application,how='inner',on='SK_ID_CURR',suffixes='_x')
In [160]:
merged_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1413701 entries, 0 to 1413700
Data columns (total 57 columns):
 #   Column                       Non-Null Count    Dtype   
---  ------                       --------------    -----   
 0   SK_ID_CURR                   1413701 non-null  int64   
 1   TARGET                       1413701 non-null  int64   
 2   NAME_CONTRACT_TYPE           1413701 non-null  object  
 3   CODE_GENDER                  1413701 non-null  object  
 4   FLAG_OWN_CAR                 1413701 non-null  object  
 5   FLAG_OWN_REALTY              1413701 non-null  object  
 6   CNT_CHILDREN                 1413701 non-null  int64   
 7   AMT_INCOME_TOTAL             1413701 non-null  float64 
 8   AMT_CREDIT                   1413701 non-null  float64 
 9   AMT_ANNUITY                  1413701 non-null  float64 
 10  AMT_GOODS_PRICE              1413701 non-null  float64 
 11  NAME_TYPE_SUITE              1413701 non-null  object  
 12  NAME_INCOME_TYPE             1413701 non-null  object  
 13  NAME_EDUCATION_TYPE          1413701 non-null  object  
 14  NAME_FAMILY_STATUS           1413701 non-null  object  
 15  NAME_HOUSING_TYPE            1413701 non-null  object  
 16  REGION_POPULATION_RELATIVE   1413701 non-null  float64 
 17  DAYS_BIRTH                   1413701 non-null  int64   
 18  DAYS_EMPLOYED                1413701 non-null  float64 
 19  DAYS_REGISTRATION            1413701 non-null  int64   
 20  DAYS_ID_PUBLISH              1413701 non-null  int64   
 21  FLAG_MOBIL                   1413701 non-null  int64   
 22  FLAG_EMP_PHONE               1413701 non-null  int64   
 23  FLAG_WORK_PHONE              1413701 non-null  int64   
 24  FLAG_CONT_MOBILE             1413701 non-null  int64   
 25  FLAG_PHONE                   1413701 non-null  int64   
 26  FLAG_EMAIL                   1413701 non-null  int64   
 27  OCCUPATION_TYPE              1413701 non-null  object  
 28  CNT_FAM_MEMBERS              1413701 non-null  int64   
 29  REGION_RATING_CLIENT         1413701 non-null  int64   
 30  REGION_RATING_CLIENT_W_CITY  1413701 non-null  int64   
 31  WEEKDAY_APPR_PROCESS_START   1413701 non-null  object  
 32  HOUR_APPR_PROCESS_START      1413701 non-null  int64   
 33  REG_REGION_NOT_LIVE_REGION   1413701 non-null  int64   
 34  REG_REGION_NOT_WORK_REGION   1413701 non-null  int64   
 35  LIVE_REGION_NOT_WORK_REGION  1413701 non-null  int64   
 36  REG_CITY_NOT_LIVE_CITY       1413701 non-null  int64   
 37  REG_CITY_NOT_WORK_CITY       1413701 non-null  int64   
 38  LIVE_CITY_NOT_WORK_CITY      1413701 non-null  int64   
 39  ORGANIZATION_TYPE            1413701 non-null  object  
 40  EXT_SOURCE_2                 1413701 non-null  float64 
 41  EXT_SOURCE_3                 1413701 non-null  float64 
 42  DAYS_LAST_PHONE_CHANGE       1413701 non-null  int64   
 43  INCOME_SLAB                  1413701 non-null  category
 44  AMT_CREDIT_slab              1413701 non-null  category
 45  AGE                          1413701 non-null  int64   
 46  AGE_BINS                     1413697 non-null  category
 47  CREDIT_RATIO                 1413701 non-null  int64   
 48  AMT_APPLICATION              1413701 non-null  float64 
 49  NAME_CONTRACT_STATUS         1413701 non-null  object  
 50  DAYS_DECISION                1413701 non-null  int64   
 51  NAME_PAYMENT_TYPE            1413701 non-null  object  
 52  CODE_REJECT_REASON           1413701 non-null  object  
 53  NAME_CLIENT_TYPE             1413701 non-null  object  
 54  NAME_PORTFOLIO               1413701 non-null  object  
 55  NAME_PRODUCT_TYPE            1413701 non-null  object  
 56  CHANNEL_TYPE                 1413701 non-null  object  
dtypes: category(3), float64(9), int64(26), object(19)
memory usage: 597.3+ MB

BIVARIATE/MULTIVARIATE ANALYSIS %of Loan Payment Difficulties

In [161]:
# %of Loan Payment Difficulties for NAME_CONTRACT_STATUS and NAME_CLIENT_TYPE

table=pd.pivot_table(merged_df, values='TARGET', index=['NAME_CLIENT_TYPE'],
                    columns=['NAME_CONTRACT_STATUS'], aggfunc=np.mean)
cm=sns.light_palette("blue", as_cmap=True)
table.style.background_gradient(cmap=cm)
Out[161]:
NAME_CONTRACT_STATUS Approved Canceled Refused Unused offer
NAME_CLIENT_TYPE        
New 0.088216 0.145205 0.110940 0.089448
Refreshed 0.065158 0.081098 0.117412 0.074324
Repeater 0.072144 0.091767 0.120596 0.083338
XNA 0.071264 0.101377 0.135714 0.103448
In [162]:
table.T.plot(kind='bar').set_ylabel('% of Loan-Payment Difficulties')

plt.title('% of Loan Payment Difficulties for NAME_CONTRACT_STATUS and NAME_CLIENT_TYPE',fontdict={'fontsize':18},pad=20)

plt.show()
In [163]:
# %of Loan Payment Difficulties for NAME_CONTRACT_STATUS and NAME_CLIENT_TYPE

table=pd.pivot_table(merged_df, values='TARGET', index=['NAME_CONTRACT_TYPE'],
                    columns=['NAME_CONTRACT_STATUS'], aggfunc=np.mean)
cm=sns.light_palette("blue", as_cmap=True)
table.style.background_gradient(cmap=cm)
Out[163]:
NAME_CONTRACT_STATUS Approved Canceled Refused Unused offer
NAME_CONTRACT_TYPE        
Cash loans 0.078105 0.094178 0.123735 0.084637
Revolving loans 0.049836 0.058751 0.069429 0.061972
In [164]:
table.T.plot(kind='bar').set_ylabel('% of Loan-Payment Difficulties')

plt.title('% of Loan Payment Difficulties for NAME_CONTRACT_STATUS and NAME_CLIENT_TYPE',fontdict={'fontsize':18},pad=20)

plt.show()
In [165]:
#NAME_HOUSING_TYPE Vs NAME_CONTRACT_STATUS

plt.figure(figsize=[20,10])
sns.countplot(data=merged_df, y='NAME_HOUSING_TYPE', order=merged_df['NAME_HOUSING_TYPE'].value_counts().index,
             hue='NAME_CONTRACT_STATUS').set_title('NAME_HOUSING_TYPE Vs NAME_CONTRACT_STATUS',
                                               fontsize=30, color='blue',pad= 20)
plt.ylabel("NAME_HOUSING_TYPE",fontdict={'fontsize':25,'fontweight':5,'color':'Blue'})

plt.show()
In [166]:
#CODE_GENDER Vs NAME_CONTRACT_STATUS

plt.figure(figsize=[20,10])
sns.countplot(data=merged_df, y='CODE_GENDER', order=merged_df['CODE_GENDER'].value_counts().index,
             hue='NAME_CONTRACT_STATUS').set_title('CODE_GENDER Vs NAME_CONTRACT_STATUS',
                                               fontsize=30, color='blue',pad= 20)
plt.ylabel("CODE_GENDER",fontdict={'fontsize':25,'fontweight':5,'color':'Blue'})

plt.show()
In [167]:
#NAME_EDUCATION_TYPE Vs NAME_CONTRACT_STATUS

plt.figure(figsize=[20,10])

sns.countplot(data=merged_df, y='NAME_EDUCATION_TYPE', order=merged_df['NAME_EDUCATION_TYPE'].value_counts().index,
             hue='NAME_CONTRACT_STATUS',palette='crest').set_title('NAME_EDUCATION_TYPE Vs NAME_CONTRACT_STATUS',
                                               fontsize=30, color='blue',pad= 20)


plt.ylabel("NAME_EDUCATION_TYPE",fontdict={'fontsize':25,'fontweight':5,'color':'blue'})

plt.show()
In [168]:
#AMT_CREDIT_SLAB Vs NAME_CONTRACT_STATUS

plt.figure(figsize=[20,10])

sns.countplot(data=merged_df, y='AMT_CREDIT_slab', order=merged_df['AMT_CREDIT_slab'].value_counts().index,
             hue='NAME_CONTRACT_STATUS',palette='icefire').set_title('AMT_CREDIT_slab Vs NAME_CONTRACT_STATUS',
                                               fontsize=30, color='Blue',pad= 20)


plt.ylabel("AMT_CREDIT_slab",fontdict={'fontsize':25,'fontweight':5,'color':'Blue'})

plt.show()
In [169]:
#AGE_BINS Vs NAME_CONTRACT_STATUS

plt.figure(figsize=[20,10])

sns.countplot(data=merged_df, y='AGE_BINS', order=merged_df['AGE_BINS'].value_counts().index,
             hue='NAME_CONTRACT_STATUS',palette='cubehelix').set_title('AGE_BINS Vs NAME_CONTRACT_STATUS',
                                               fontsize=30, color='blue',pad= 20)


plt.ylabel("NAME_EDUCATION_TYPE",fontdict={'fontsize':25,'fontweight':5,'color':'blue'})

plt.show()
In [170]:
#INCOME_SLAB Vs NAME_CONTRACT_STATUS

plt.figure(figsize=[20,10])

sns.countplot(data=merged_df, y='INCOME_SLAB', order=merged_df['INCOME_SLAB'].value_counts().index,
             hue='NAME_CONTRACT_STATUS',palette='Paired').set_title('INCOME_SLAB Vs NAME_CONTRACT_STATUS',
                                               fontsize=30, color='Blue',pad= 20)


plt.ylabel("INCOME_SLAB",fontdict={'fontsize':25,'fontweight':5,'color':'Blue'})

plt.show()
In [171]:
#Contract type Vs AMT_CREDIT

plt.figure(figsize=[20,10])
sns.boxenplot(x=merged_df['NAME_CONTRACT_STATUS'], y=merged_df['AMT_CREDIT'], hue=merged_df['TARGET'],
             palette='viridis').set_title("Contract Vs AMT_CREDIT", fontsize=30,color='blue',pad=20)
plt.xlabel("NAME_CONTRACT_STATUS",fontdict={'fontsize':25,'fontweight':5,'color':'Blue'})
plt.ylabel("AMT_CREDIT",fontdict={'fontsize':25,'fontweight':5,'color':'Blue'})

plt.show()
In [172]:
#INCOME_SLAB Vs AMT_CREDIT

plt.figure(figsize=[20,10])
sns.boxenplot(x=merged_df['INCOME_SLAB'], y=merged_df['AMT_CREDIT'], hue=merged_df['TARGET'],
             palette='viridis').set_title("INCOME_SLAB Vs AMT_CREDIT", fontsize=30,color='blue',pad=20)
plt.xlabel("INCOME_SLAB",fontdict={'fontsize':25,'fontweight':5,'color':'Blue'})
plt.ylabel("AMT_CREDIT",fontdict={'fontsize':25,'fontweight':5,'color':'Blue'})

plt.show()
In [173]:
#NAME_INCOME_TYPE Vs AMT_CREDIT

plt.figure(figsize=[20,10])
sns.boxenplot(x=merged_df['NAME_INCOME_TYPE'], y=merged_df['AMT_CREDIT'], hue=merged_df['TARGET'],
             palette='flare').set_title("NAME_INCOME_TYPE Vs AMT_CREDIT", fontsize=30,color='blue',pad=20)
plt.xlabel("NAME_INCOME_TYPE",fontdict={'fontsize':25,'fontweight':5,'color':'Blue'})
plt.ylabel("AMT_CREDIT",fontdict={'fontsize':25,'fontweight':5,'color':'Blue'})

plt.show()
In [174]:
#NAME_CLIENT_TYPE Vs AMT_CREDIT

plt.figure(figsize=[20,10])
sns.boxenplot(x=merged_df['NAME_CLIENT_TYPE'], y=merged_df['AMT_CREDIT'], hue=merged_df['TARGET'],
             palette='light:#5A9').set_title("NAME_CLIENT_TYPE Vs AMT_CREDIT", fontsize=30,color='blue',pad=20)
plt.xlabel("NAME_CLIENT_TYPE",fontdict={'fontsize':25,'fontweight':5,'color':'Blue'})
plt.ylabel("AMT_CREDIT",fontdict={'fontsize':25,'fontweight':5,'color':'Blue'})

plt.show()